Diffstat (limited to 'arch'): 200 files changed, 7221 insertions, 1331 deletions
diff --git a/arch/arc/Kbuild b/arch/arc/Kbuild index b94102fff68b..20ea7dd482d4 100644 --- a/arch/arc/Kbuild +++ b/arch/arc/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += kernel/ obj-y += mm/ +obj-y += net/ # for cleaning subdir- += boot diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 4092bec198be..fd0b0a0d4686 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -51,6 +51,7 @@ config ARC select PCI_SYSCALL if PCI select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 select TRACE_IRQFLAGS_SUPPORT + select HAVE_EBPF_JIT if ISA_ARCV2 config LOCKDEP_SUPPORT def_bool y diff --git a/arch/arc/net/Makefile b/arch/arc/net/Makefile new file mode 100644 index 000000000000..ea5790952e9a --- /dev/null +++ b/arch/arc/net/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only + +ifeq ($(CONFIG_ISA_ARCV2),y) + obj-$(CONFIG_BPF_JIT) += bpf_jit_core.o + obj-$(CONFIG_BPF_JIT) += bpf_jit_arcv2.o +endif diff --git a/arch/arc/net/bpf_jit.h b/arch/arc/net/bpf_jit.h new file mode 100644 index 000000000000..ec44873c42d1 --- /dev/null +++ b/arch/arc/net/bpf_jit.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The interface that a back-end should provide to bpf_jit_core.c. + * + * Copyright (c) 2024 Synopsys Inc. + * Author: Shahab Vahedi <shahab@synopsys.com> + */ + +#ifndef _ARC_BPF_JIT_H +#define _ARC_BPF_JIT_H + +#include <linux/bpf.h> +#include <linux/filter.h> + +/* Print debug info and assert. */ +//#define ARC_BPF_JIT_DEBUG + +/* Determine the address type of the target. */ +#ifdef CONFIG_ISA_ARCV2 +#define ARC_ADDR u32 +#endif + +/* + * For the translation of some BPF instructions, a temporary register + * might be needed for some interim data. + */ +#define JIT_REG_TMP MAX_BPF_JIT_REG + +/* + * Buffer access: If buffer "b" is not NULL, advance by "n" bytes. + * + * This macro must be used in any place that potentially requires a + * "buf + len". This way, we make sure that the "buf" argument for + * the underlying "arc_*(buf, ...)" ends up as NULL instead of something + * like "0+4" or "0+8", etc. Those "arc_*()" functions check their "buf" + * value to decide if instructions should be emitted or not. + */ +#define BUF(b, n) (((b) != NULL) ? ((b) + (n)) : (b)) + +/************** Functions that the back-end must provide **************/ +/* Extension for 32-bit operations. 
*/ +inline u8 zext(u8 *buf, u8 rd); +/***** Moves *****/ +u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext); +u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm); +u8 mov_r64(u8 *buf, u8 rd, u8 rs, u8 sign_ext); +u8 mov_r64_i32(u8 *buf, u8 reg, s32 imm); +u8 mov_r64_i64(u8 *buf, u8 reg, u32 lo, u32 hi); +/***** Loads and stores *****/ +u8 load_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size, bool sign_ext); +u8 store_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size); +u8 store_i(u8 *buf, s32 imm, u8 rd, s16 off, u8 size); +/***** Addition *****/ +u8 add_r32(u8 *buf, u8 rd, u8 rs); +u8 add_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 add_r64(u8 *buf, u8 rd, u8 rs); +u8 add_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Subtraction *****/ +u8 sub_r32(u8 *buf, u8 rd, u8 rs); +u8 sub_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 sub_r64(u8 *buf, u8 rd, u8 rs); +u8 sub_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Multiplication *****/ +u8 mul_r32(u8 *buf, u8 rd, u8 rs); +u8 mul_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 mul_r64(u8 *buf, u8 rd, u8 rs); +u8 mul_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Division *****/ +u8 div_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext); +u8 div_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext); +/***** Remainder *****/ +u8 mod_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext); +u8 mod_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext); +/***** Bitwise AND *****/ +u8 and_r32(u8 *buf, u8 rd, u8 rs); +u8 and_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 and_r64(u8 *buf, u8 rd, u8 rs); +u8 and_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise OR *****/ +u8 or_r32(u8 *buf, u8 rd, u8 rs); +u8 or_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 or_r64(u8 *buf, u8 rd, u8 rs); +u8 or_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise XOR *****/ +u8 xor_r32(u8 *buf, u8 rd, u8 rs); +u8 xor_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 xor_r64(u8 *buf, u8 rd, u8 rs); +u8 xor_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise Negate *****/ +u8 neg_r32(u8 *buf, u8 r); +u8 neg_r64(u8 *buf, u8 r); +/***** Bitwise left shift *****/ +u8 lsh_r32(u8 *buf, u8 rd, u8 rs); +u8 lsh_r32_i32(u8 *buf, u8 rd, u8 imm); +u8 lsh_r64(u8 *buf, u8 rd, u8 rs); +u8 lsh_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise right shift (logical) *****/ +u8 rsh_r32(u8 *buf, u8 rd, u8 rs); +u8 rsh_r32_i32(u8 *buf, u8 rd, u8 imm); +u8 rsh_r64(u8 *buf, u8 rd, u8 rs); +u8 rsh_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise right shift (arithmetic) *****/ +u8 arsh_r32(u8 *buf, u8 rd, u8 rs); +u8 arsh_r32_i32(u8 *buf, u8 rd, u8 imm); +u8 arsh_r64(u8 *buf, u8 rd, u8 rs); +u8 arsh_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Frame related *****/ +u32 mask_for_used_regs(u8 bpf_reg, bool is_call); +u8 arc_prologue(u8 *buf, u32 usage, u16 frame_size); +u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size); +/***** Jumps *****/ +/* + * Different sorts of conditions (ARC enum as opposed to BPF_*). + * + * Do not change the order of enums here. ARC_CC_SLE+1 is used + * to determine the number of JCCs. + */ +enum ARC_CC { + ARC_CC_UGT = 0, /* unsigned > */ + ARC_CC_UGE, /* unsigned >= */ + ARC_CC_ULT, /* unsigned < */ + ARC_CC_ULE, /* unsigned <= */ + ARC_CC_SGT, /* signed > */ + ARC_CC_SGE, /* signed >= */ + ARC_CC_SLT, /* signed < */ + ARC_CC_SLE, /* signed <= */ + ARC_CC_AL, /* always */ + ARC_CC_EQ, /* == */ + ARC_CC_NE, /* != */ + ARC_CC_SET, /* test */ + ARC_CC_LAST +}; + +/* + * A few notes: + * + * - check_jmp_*() are prerequisites before calling the gen_jmp_*(). + * They return "true" if the jump is possible and "false" otherwise. 
+ * + * - The notion of "*_off" is to emphasize that these parameters are + * merely offsets in the JIT stream and not absolute addresses. One + * can look at them as addresses if the JIT code would start from + * address 0x0000_0000. Nonetheless, since the buffer address for the + * JIT is on a word-aligned address, this works and actually makes + * things simpler (offsets are in the range of u32 which is more than + * enough). + */ +bool check_jmp_32(u32 curr_off, u32 targ_off, u8 cond); +bool check_jmp_64(u32 curr_off, u32 targ_off, u8 cond); +u8 gen_jmp_32(u8 *buf, u8 rd, u8 rs, u8 cond, u32 c_off, u32 t_off); +u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 c_off, u32 t_off); +/***** Miscellaneous *****/ +u8 gen_func_call(u8 *buf, ARC_ADDR func_addr, bool external_func); +u8 arc_to_bpf_return(u8 *buf); +/* + * - Perform byte swaps on "rd" based on the "size". + * - If "force" is set, do it unconditionally. Otherwise, consider the + * desired "endian"ness and the host endianness. + * - For data "size"s up to 32 bits, perform a zero-extension if asked + * by the "do_zext" boolean. + */ +u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext); + +#endif /* _ARC_BPF_JIT_H */ diff --git a/arch/arc/net/bpf_jit_arcv2.c b/arch/arc/net/bpf_jit_arcv2.c new file mode 100644 index 000000000000..31bfb6e9ce00 --- /dev/null +++ b/arch/arc/net/bpf_jit_arcv2.c @@ -0,0 +1,3005 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The ARCv2 backend of Just-In-Time compiler for eBPF bytecode. + * + * Copyright (c) 2024 Synopsys Inc. + * Author: Shahab Vahedi <shahab@synopsys.com> + */ +#include <linux/bug.h> +#include "bpf_jit.h" + +/* ARC core registers. */ +enum { + ARC_R_0, ARC_R_1, ARC_R_2, ARC_R_3, ARC_R_4, ARC_R_5, + ARC_R_6, ARC_R_7, ARC_R_8, ARC_R_9, ARC_R_10, ARC_R_11, + ARC_R_12, ARC_R_13, ARC_R_14, ARC_R_15, ARC_R_16, ARC_R_17, + ARC_R_18, ARC_R_19, ARC_R_20, ARC_R_21, ARC_R_22, ARC_R_23, + ARC_R_24, ARC_R_25, ARC_R_26, ARC_R_FP, ARC_R_SP, ARC_R_ILINK, + ARC_R_30, ARC_R_BLINK, + /* + * Having ARC_R_IMM encoded as source register means there is an + * immediate that must be interpreted from the next 4 bytes. If + * encoded as the destination register though, it implies that the + * output of the operation is not assigned to any register. The + * latter is helpful if we only care about updating the CPU status + * flags. + */ + ARC_R_IMM = 62 +}; + +/* + * Remarks about the rationale behind the chosen mapping: + * + * - BPF_REG_{1,2,3,4} are the argument registers and must be mapped to + * argument registers in ARCv2 ABI: r0-r7. The r7 registers is the last + * argument register in the ABI. Therefore BPF_REG_5, as the fifth + * argument, must be pushed onto the stack. This is a must for calling + * in-kernel functions. + * + * - In ARCv2 ABI, the return value is in r0 for 32-bit results and (r1,r0) + * for 64-bit results. However, because they're already used for BPF_REG_1, + * the next available scratch registers, r8 and r9, are the best candidates + * for BPF_REG_0. After a "call" to a(n) (in-kernel) function, the result + * is "mov"ed to these registers. At a BPF_EXIT, their value is "mov"ed to + * (r1,r0). + * It is worth mentioning that scratch registers are the best choice for + * BPF_REG_0, because it is very popular in BPF instruction encoding. + * + * - JIT_REG_TMP is an artifact needed to translate some BPF instructions. + * Its life span is one single BPF instruction. 
Since during the + * analyze_reg_usage(), it is not known if temporary registers are used, + * it is mapped to ARC's scratch registers: r10 and r11. Therefore, they + * don't matter in analysing phase and don't need saving. This temporary + * register is added as yet another index in the bpf2arc array, so it will + * unfold like the rest of registers during the code generation process. + * + * - Mapping of callee-saved BPF registers, BPF_REG_{6,7,8,9}, starts from + * (r15,r14) register pair. The (r13,r12) is not a good choice, because + * in ARCv2 ABI, r12 is not a callee-saved register and this can cause + * problem when calling an in-kernel function. Theoretically, the mapping + * could start from (r14,r13), but it is not a conventional ARCv2 register + * pair. To have a future proof design, I opted for this arrangement. + * If/when we decide to add ARCv2 instructions that do use register pairs, + * the mapping, hopefully, doesn't need to be revisited. + */ +const u8 bpf2arc[][2] = { + /* Return value from in-kernel function, and exit value from eBPF */ + [BPF_REG_0] = {ARC_R_8, ARC_R_9}, + /* Arguments from eBPF program to in-kernel function */ + [BPF_REG_1] = {ARC_R_0, ARC_R_1}, + [BPF_REG_2] = {ARC_R_2, ARC_R_3}, + [BPF_REG_3] = {ARC_R_4, ARC_R_5}, + [BPF_REG_4] = {ARC_R_6, ARC_R_7}, + /* Remaining arguments, to be passed on the stack per 32-bit ABI */ + [BPF_REG_5] = {ARC_R_22, ARC_R_23}, + /* Callee-saved registers that in-kernel function will preserve */ + [BPF_REG_6] = {ARC_R_14, ARC_R_15}, + [BPF_REG_7] = {ARC_R_16, ARC_R_17}, + [BPF_REG_8] = {ARC_R_18, ARC_R_19}, + [BPF_REG_9] = {ARC_R_20, ARC_R_21}, + /* Read-only frame pointer to access the eBPF stack. 32-bit only. */ + [BPF_REG_FP] = {ARC_R_FP, }, + /* Register for blinding constants */ + [BPF_REG_AX] = {ARC_R_24, ARC_R_25}, + /* Temporary registers for internal use */ + [JIT_REG_TMP] = {ARC_R_10, ARC_R_11} +}; + +#define ARC_CALLEE_SAVED_REG_FIRST ARC_R_13 +#define ARC_CALLEE_SAVED_REG_LAST ARC_R_25 + +#define REG_LO(r) (bpf2arc[(r)][0]) +#define REG_HI(r) (bpf2arc[(r)][1]) + +/* + * To comply with ARCv2 ABI, BPF's arg5 must be put on stack. After which, + * the stack needs to be restored by ARG5_SIZE. + */ +#define ARG5_SIZE 8 + +/* Instruction lengths in bytes. */ +enum { + INSN_len_normal = 4, /* Normal instructions length. */ + INSN_len_imm = 4 /* Length of an extra 32-bit immediate. */ +}; + +/* ZZ defines the size of operation in encodings that it is used. */ +enum { + ZZ_1_byte = 1, + ZZ_2_byte = 2, + ZZ_4_byte = 0, + ZZ_8_byte = 3 +}; + +/* + * AA is mostly about address write back mode. It determines if the + * address in question should be updated before usage or after: + * addr += offset; data = *addr; + * data = *addr; addr += offset; + * + * In "scaling" mode, the effective address will become the sum + * of "address" + "index"*"size". The "size" is specified by the + * "ZZ" field. There is no write back when AA is set for scaling: + * data = *(addr + offset<<zz) + */ +enum { + AA_none = 0, + AA_pre = 1, /* in assembly known as "a/aw". */ + AA_post = 2, /* in assembly known as "ab". */ + AA_scale = 3 /* in assembly known as "as". */ +}; + +/* X flag determines the mode of extension. */ +enum { + X_zero = 0, + X_sign = 1 +}; + +/* Condition codes. 
*/ +enum { + CC_always = 0, /* condition is true all the time */ + CC_equal = 1, /* if status32.z flag is set */ + CC_unequal = 2, /* if status32.z flag is clear */ + CC_positive = 3, /* if status32.n flag is clear */ + CC_negative = 4, /* if status32.n flag is set */ + CC_less_u = 5, /* less than (unsigned) */ + CC_less_eq_u = 14, /* less than or equal (unsigned) */ + CC_great_eq_u = 6, /* greater than or equal (unsigned) */ + CC_great_u = 13, /* greater than (unsigned) */ + CC_less_s = 11, /* less than (signed) */ + CC_less_eq_s = 12, /* less than or equal (signed) */ + CC_great_eq_s = 10, /* greater than or equal (signed) */ + CC_great_s = 9 /* greater than (signed) */ +}; + +#define IN_U6_RANGE(x) ((x) <= (0x40 - 1) && (x) >= 0) +#define IN_S9_RANGE(x) ((x) <= (0x100 - 1) && (x) >= -0x100) +#define IN_S12_RANGE(x) ((x) <= (0x800 - 1) && (x) >= -0x800) +#define IN_S21_RANGE(x) ((x) <= (0x100000 - 1) && (x) >= -0x100000) +#define IN_S25_RANGE(x) ((x) <= (0x1000000 - 1) && (x) >= -0x1000000) + +/* Operands in most of the encodings. */ +#define OP_A(x) ((x) & 0x03f) +#define OP_B(x) ((((x) & 0x07) << 24) | (((x) & 0x38) << 9)) +#define OP_C(x) (((x) & 0x03f) << 6) +#define OP_IMM (OP_C(ARC_R_IMM)) +#define COND(x) (OP_A((x) & 31)) +#define FLAG(x) (((x) & 1) << 15) + +/* + * The 4-byte encoding of "mov b,c": + * + * 0010_0bbb 0000_1010 0BBB_cccc cc00_0000 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_MOV 0x200a0000 + +/* + * The 4-byte encoding of "mov b,s12" (used for moving small immediates): + * + * 0010_0bbb 1000_1010 0BBB_ssss ssSS_SSSS + * + * b: BBBbbb destination register + * s: SSSSSSssssss source immediate (signed) + */ +#define OPC_MOVI 0x208a0000 +#define MOVI_S12(x) ((((x) & 0xfc0) >> 6) | (((x) & 0x3f) << 6)) + +/* + * The 4-byte encoding of "mov[.qq] b,u6", used for conditional + * moving of even smaller immediates: + * + * 0010_0bbb 1100_1010 0BBB_cccc cciq_qqqq + * + * qq: qqqqq condition code + * i: If set, c is considered a 6-bit immediate, else a reg. + * + * b: BBBbbb destination register + * c: cccccc source + */ +#define OPC_MOV_CC 0x20ca0000 +#define MOV_CC_I BIT(5) +#define OPC_MOVU_CC (OPC_MOV_CC | MOV_CC_I) + +/* + * The 4-byte encoding of "sexb b,c" (8-bit sign extension): + * + * 0010_0bbb 0010_1111 0BBB_cccc cc00_0101 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_SEXB 0x202f0005 + +/* + * The 4-byte encoding of "sexh b,c" (16-bit sign extension): + * + * 0010_0bbb 0010_1111 0BBB_cccc cc00_0110 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_SEXH 0x202f0006 + +/* + * The 4-byte encoding of "ld[zz][.x][.aa] c,[b,s9]": + * + * 0001_0bbb ssss_ssss SBBB_0aaz zxcc_cccc + * + * zz: size mode + * aa: address write back mode + * x: extension mode + * + * s9: S_ssss_ssss 9-bit signed number + * b: BBBbbb source reg for address + * c: cccccc destination register + */ +#define OPC_LOAD 0x10000000 +#define LOAD_X(x) ((x) << 6) +#define LOAD_ZZ(x) ((x) << 7) +#define LOAD_AA(x) ((x) << 9) +#define LOAD_S9(x) ((((x) & 0x0ff) << 16) | (((x) & 0x100) << 7)) +#define LOAD_C(x) ((x) & 0x03f) +/* Unsigned and signed loads. */ +#define OPC_LDU (OPC_LOAD | LOAD_X(X_zero)) +#define OPC_LDS (OPC_LOAD | LOAD_X(X_sign)) +/* 32-bit load. */ +#define OPC_LD32 (OPC_LDU | LOAD_ZZ(ZZ_4_byte)) +/* "pop reg" is merely a "ld.ab reg,[sp,4]". 
*/ +#define OPC_POP \ + (OPC_LD32 | LOAD_AA(AA_post) | LOAD_S9(4) | OP_B(ARC_R_SP)) + +/* + * The 4-byte encoding of "st[zz][.aa] c,[b,s9]": + * + * 0001_1bbb ssss_ssss SBBB_cccc cc0a_azz0 + * + * zz: zz size mode + * aa: aa address write back mode + * + * s9: S_ssss_ssss 9-bit signed number + * b: BBBbbb source reg for address + * c: cccccc source reg to be stored + */ +#define OPC_STORE 0x18000000 +#define STORE_ZZ(x) ((x) << 1) +#define STORE_AA(x) ((x) << 3) +#define STORE_S9(x) ((((x) & 0x0ff) << 16) | (((x) & 0x100) << 7)) +/* 32-bit store. */ +#define OPC_ST32 (OPC_STORE | STORE_ZZ(ZZ_4_byte)) +/* "push reg" is merely a "st.aw reg,[sp,-4]". */ +#define OPC_PUSH \ + (OPC_ST32 | STORE_AA(AA_pre) | STORE_S9(-4) | OP_B(ARC_R_SP)) + +/* + * The 4-byte encoding of "add a,b,c": + * + * 0010_0bbb 0i00_0000 fBBB_cccc ccaa_aaaa + * + * f: indicates if flags (carry, etc.) should be updated + * i: If set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_ADD 0x20000000 +/* Addition with updating the pertinent flags in "status32" register. */ +#define OPC_ADDF (OPC_ADD | FLAG(1)) +#define ADDI BIT(22) +#define ADDI_U6(x) OP_C(x) +#define OPC_ADDI (OPC_ADD | ADDI) +#define OPC_ADDIF (OPC_ADDI | FLAG(1)) +#define OPC_ADD_I (OPC_ADD | OP_IMM) + +/* + * The 4-byte encoding of "adc a,b,c" (addition with carry): + * + * 0010_0bbb 0i00_0001 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_ADC 0x20010000 +#define ADCI BIT(22) +#define ADCI_U6(x) OP_C(x) +#define OPC_ADCI (OPC_ADC | ADCI) + +/* + * The 4-byte encoding of "sub a,b,c": + * + * 0010_0bbb 0i00_0010 fBBB_cccc ccaa_aaaa + * + * f: indicates if flags (carry, etc.) should be updated + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_SUB 0x20020000 +/* Subtraction with updating the pertinent flags in "status32" register. */ +#define OPC_SUBF (OPC_SUB | FLAG(1)) +#define SUBI BIT(22) +#define SUBI_U6(x) OP_C(x) +#define OPC_SUBI (OPC_SUB | SUBI) +#define OPC_SUB_I (OPC_SUB | OP_IMM) + +/* + * The 4-byte encoding of "sbc a,b,c" (subtraction with carry): + * + * 0010_0bbb 0000_0011 fBBB_cccc ccaa_aaaa + * + * f: indicates if flags (carry, etc.) should be updated + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_SBC 0x20030000 + +/* + * The 4-byte encoding of "cmp[.qq] b,c": + * + * 0010_0bbb 1100_1100 1BBB_cccc cc0q_qqqq + * + * qq: qqqqq condition code + * + * b: BBBbbb the 1st operand + * c: cccccc the 2nd operand + */ +#define OPC_CMP 0x20cc8000 + +/* + * The 4-byte encoding of "neg a,b": + * + * 0010_0bbb 0100_1110 0BBB_0000 00aa_aaaa + * + * a: aaaaaa result + * b: BBBbbb input + */ +#define OPC_NEG 0x204e0000 + +/* + * The 4-byte encoding of "mpy a,b,c". + * mpy is the signed 32-bit multiplication with the lower 32-bit + * of the product as the result. + * + * 0010_0bbb 0001_1010 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_MPY 0x201a0000 +#define OPC_MPYI (OPC_MPY | OP_IMM) + +/* + * The 4-byte encoding of "mpydu a,b,c". 
+ * mpydu is the unsigned 32-bit multiplication with the lower 32-bit of + * the product in register "a" and the higher 32-bit in register "a+1". + * + * 0010_1bbb 0001_1001 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa 64-bit result in registers (R_a+1,R_a) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_MPYDU 0x28190000 +#define OPC_MPYDUI (OPC_MPYDU | OP_IMM) + +/* + * The 4-byte encoding of "divu a,b,c" (unsigned division): + * + * 0010_1bbb 0000_0101 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (quotient) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_DIVU 0x28050000 +#define OPC_DIVUI (OPC_DIVU | OP_IMM) + +/* + * The 4-byte encoding of "div a,b,c" (signed division): + * + * 0010_1bbb 0000_0100 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (quotient) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_DIVS 0x28040000 +#define OPC_DIVSI (OPC_DIVS | OP_IMM) + +/* + * The 4-byte encoding of "remu a,b,c" (unsigned remainder): + * + * 0010_1bbb 0000_1001 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (remainder) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_REMU 0x28090000 +#define OPC_REMUI (OPC_REMU | OP_IMM) + +/* + * The 4-byte encoding of "rem a,b,c" (signed remainder): + * + * 0010_1bbb 0000_1000 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (remainder) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_REMS 0x28080000 +#define OPC_REMSI (OPC_REMS | OP_IMM) + +/* + * The 4-byte encoding of "and a,b,c": + * + * 0010_0bbb 0000_0100 fBBB_cccc ccaa_aaaa + * + * f: indicates if zero and negative flags should be updated + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_AND 0x20040000 +#define OPC_ANDI (OPC_AND | OP_IMM) + +/* + * The 4-byte encoding of "tst[.qq] b,c". + * Checks if the two input operands have any bit set at the same + * position. + * + * 0010_0bbb 1100_1011 1BBB_cccc cc0q_qqqq + * + * qq: qqqqq condition code + * + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_TST 0x20cb8000 + +/* + * The 4-byte encoding of "or a,b,c": + * + * 0010_0bbb 0000_0101 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_OR 0x20050000 +#define OPC_ORI (OPC_OR | OP_IMM) + +/* + * The 4-byte encoding of "xor a,b,c": + * + * 0010_0bbb 0000_0111 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_XOR 0x20070000 +#define OPC_XORI (OPC_XOR | OP_IMM) + +/* + * The 4-byte encoding of "not b,c": + * + * 0010_0bbb 0010_1111 0BBB_cccc cc00_1010 + * + * b: BBBbbb result + * c: cccccc input + */ +#define OPC_NOT 0x202f000a + +/* + * The 4-byte encoding of "btst b,u6": + * + * 0010_0bbb 0101_0001 1BBB_uuuu uu00_0000 + * + * b: BBBbbb input number to check + * u6: uuuuuu 6-bit unsigned number specifying bit position to check + */ +#define OPC_BTSTU6 0x20518000 +#define BTST_U6(x) (OP_C((x) & 63)) + +/* + * The 4-byte encoding of "asl[.qq] b,b,c" (arithmetic shift left): + * + * 0010_1bbb 0i00_0000 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 5-bit immediate, else a reg. 
+ * + * b: BBBbbb result and the first operand (number to be shifted) + * c: cccccc amount to be shifted + */ +#define OPC_ASL 0x28000000 +#define ASL_I BIT(22) +#define ASLI_U6(x) OP_C((x) & 31) +#define OPC_ASLI (OPC_ASL | ASL_I) + +/* + * The 4-byte encoding of "asr a,b,c" (arithmetic shift right): + * + * 0010_1bbb 0i00_0010 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb first input: number to be shifted + * c: cccccc second input: amount to be shifted + */ +#define OPC_ASR 0x28020000 +#define ASR_I ASL_I +#define ASRI_U6(x) ASLI_U6(x) +#define OPC_ASRI (OPC_ASR | ASR_I) + +/* + * The 4-byte encoding of "lsr a,b,c" (logical shift right): + * + * 0010_1bbb 0i00_0001 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb first input: number to be shifted + * c: cccccc second input: amount to be shifted + */ +#define OPC_LSR 0x28010000 +#define LSR_I ASL_I +#define LSRI_U6(x) ASLI_U6(x) +#define OPC_LSRI (OPC_LSR | LSR_I) + +/* + * The 4-byte encoding of "swape b,c": + * + * 0010_1bbb 0010_1111 0bbb_cccc cc00_1001 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_SWAPE 0x282f0009 + +/* + * Encoding for jump to an address in register: + * j reg_c + * + * 0010_0000 1110_0000 0000_cccc cc00_0000 + * + * c: cccccc register holding the destination address + */ +#define OPC_JMP 0x20e00000 +/* Jump to "branch-and-link" register, which effectively is a "return". */ +#define OPC_J_BLINK (OPC_JMP | OP_C(ARC_R_BLINK)) + +/* + * Encoding for jump-and-link to an address in register: + * jl reg_c + * + * 0010_0000 0010_0010 0000_cccc cc00_0000 + * + * c: cccccc register holding the destination address + */ +#define OPC_JL 0x20220000 + +/* + * Encoding for (conditional) branch to an offset from the current location + * that is word aligned: (PC & 0xffff_fffc) + s21 + * B[qq] s21 + * + * 0000_0sss ssss_sss0 SSSS_SSSS SS0q_qqqq + * + * qq: qqqqq condition code + * s21: SSSS SSSS_SSss ssss_ssss The displacement (21-bit signed) + * + * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore, + * it should be a multiple of 2. Hence, there is an implied '0' bit at its + * LSB: S_SSSS SSSS_Ssss ssss_sss0 + */ +#define OPC_BCC 0x00000000 +#define BCC_S21(d) ((((d) & 0x7fe) << 16) | (((d) & 0x1ff800) >> 5)) + +/* + * Encoding for unconditional branch to an offset from the current location + * that is word aligned: (PC & 0xffff_fffc) + s25 + * B s25 + * + * 0000_0sss ssss_sss1 SSSS_SSSS SS00_TTTT + * + * s25: TTTT SSSS SSSS_SSss ssss_ssss The displacement (25-bit signed) + * + * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore, + * it should be a multiple of 2. 
Hence, there is an implied '0' bit at its + * LSB: T TTTS_SSSS SSSS_Ssss ssss_sss0 + */ +#define OPC_B 0x00010000 +#define B_S25(d) ((((d) & 0x1e00000) >> 21) | BCC_S21(d)) + +static inline void emit_2_bytes(u8 *buf, u16 bytes) +{ + *((u16 *)buf) = bytes; +} + +static inline void emit_4_bytes(u8 *buf, u32 bytes) +{ + emit_2_bytes(buf, bytes >> 16); + emit_2_bytes(buf + 2, bytes & 0xffff); +} + +static inline u8 bpf_to_arc_size(u8 size) +{ + switch (size) { + case BPF_B: + return ZZ_1_byte; + case BPF_H: + return ZZ_2_byte; + case BPF_W: + return ZZ_4_byte; + case BPF_DW: + return ZZ_8_byte; + default: + return ZZ_4_byte; + } +} + +/************** Encoders (Deal with ARC regs) ************/ + +/* Move an immediate to register with a 4-byte instruction. */ +static u8 arc_movi_r(u8 *buf, u8 reg, s16 imm) +{ + const u32 insn = OPC_MOVI | OP_B(reg) | MOVI_S12(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* rd <- rs */ +static u8 arc_mov_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_MOV | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* The emitted code may have different sizes based on "imm". */ +static u8 arc_mov_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_MOV | OP_B(rd) | OP_IMM; + + if (IN_S12_RANGE(imm)) + return arc_movi_r(buf, rd, imm); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* The emitted code will always have the same size (8). */ +static u8 arc_mov_i_fixed(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_MOV | OP_B(rd) | OP_IMM; + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* Conditional move. */ +static u8 arc_mov_cc_r(u8 *buf, u8 cc, u8 rd, u8 rs) +{ + const u32 insn = OPC_MOV_CC | OP_B(rd) | OP_C(rs) | COND(cc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* Conditional move of a small immediate to rd. */ +static u8 arc_movu_cc_r(u8 *buf, u8 cc, u8 rd, u8 imm) +{ + const u32 insn = OPC_MOVU_CC | OP_B(rd) | OP_C(imm) | COND(cc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* Sign extension from a byte. */ +static u8 arc_sexb_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_SEXB | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* Sign extension from two bytes. 
*/ +static u8 arc_sexh_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_SEXH | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* st reg, [reg_mem, off] */ +static u8 arc_st_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz) +{ + const u32 insn = OPC_STORE | STORE_ZZ(zz) | OP_C(reg) | + OP_B(reg_mem) | STORE_S9(off); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* st.aw reg, [sp, -4] */ +static u8 arc_push_r(u8 *buf, u8 reg) +{ + const u32 insn = OPC_PUSH | OP_C(reg); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* ld reg, [reg_mem, off] (unsigned) */ +static u8 arc_ld_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz) +{ + const u32 insn = OPC_LDU | LOAD_ZZ(zz) | LOAD_C(reg) | + OP_B(reg_mem) | LOAD_S9(off); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* ld.x reg, [reg_mem, off] (sign extend) */ +static u8 arc_ldx_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz) +{ + const u32 insn = OPC_LDS | LOAD_ZZ(zz) | LOAD_C(reg) | + OP_B(reg_mem) | LOAD_S9(off); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* ld.ab reg,[sp,4] */ +static u8 arc_pop_r(u8 *buf, u8 reg) +{ + const u32 insn = OPC_POP | LOAD_C(reg); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add Ra,Ra,Rc */ +static u8 arc_add_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_ADD | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add.f Ra,Ra,Rc */ +static u8 arc_addf_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_ADDF | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add.f Ra,Ra,u6 */ +static u8 arc_addif_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_ADDIF | OP_A(ra) | OP_B(ra) | ADDI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add Ra,Ra,u6 */ +static u8 arc_addi_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_ADDI | OP_A(ra) | OP_B(ra) | ADDI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add Ra,Rb,imm */ +static u8 arc_add_i(u8 *buf, u8 ra, u8 rb, s32 imm) +{ + const u32 insn = OPC_ADD_I | OP_A(ra) | OP_B(rb); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* adc Ra,Ra,Rc */ +static u8 arc_adc_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_ADC | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* adc Ra,Ra,u6 */ +static u8 arc_adci_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_ADCI | OP_A(ra) | OP_B(ra) | ADCI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub Ra,Ra,Rc */ +static u8 arc_sub_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_SUB | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub.f Ra,Ra,Rc */ +static u8 arc_subf_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_SUBF | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub Ra,Ra,u6 */ +static u8 arc_subi_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_SUBI | OP_A(ra) | OP_B(ra) | SUBI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub Ra,Ra,imm */ +static u8 arc_sub_i(u8 *buf, u8 ra, s32 imm) +{ + const u32 insn = OPC_SUB_I | OP_A(ra) | OP_B(ra); + + if 
(buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* sbc Ra,Ra,Rc */ +static u8 arc_sbc_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_SBC | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* cmp Rb,Rc */ +static u8 arc_cmp_r(u8 *buf, u8 rb, u8 rc) +{ + const u32 insn = OPC_CMP | OP_B(rb) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * cmp.z Rb,Rc + * + * This "cmp.z" variant of compare instruction is used on lower + * 32-bits of register pairs after "cmp"ing their upper parts. If the + * upper parts are equal (z), then this one will proceed to check the + * rest. + */ +static u8 arc_cmpz_r(u8 *buf, u8 rb, u8 rc) +{ + const u32 insn = OPC_CMP | OP_B(rb) | OP_C(rc) | CC_equal; + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* neg Ra,Rb */ +static u8 arc_neg_r(u8 *buf, u8 ra, u8 rb) +{ + const u32 insn = OPC_NEG | OP_A(ra) | OP_B(rb); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* mpy Ra,Rb,Rc */ +static u8 arc_mpy_r(u8 *buf, u8 ra, u8 rb, u8 rc) +{ + const u32 insn = OPC_MPY | OP_A(ra) | OP_B(rb) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* mpy Ra,Rb,imm */ +static u8 arc_mpy_i(u8 *buf, u8 ra, u8 rb, s32 imm) +{ + const u32 insn = OPC_MPYI | OP_A(ra) | OP_B(rb); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* mpydu Ra,Ra,Rc */ +static u8 arc_mpydu_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_MPYDU | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* mpydu Ra,Ra,imm */ +static u8 arc_mpydu_i(u8 *buf, u8 ra, s32 imm) +{ + const u32 insn = OPC_MPYDUI | OP_A(ra) | OP_B(ra); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* divu Rd,Rd,Rs */ +static u8 arc_divu_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_DIVU | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* divu Rd,Rd,imm */ +static u8 arc_divu_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_DIVUI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* div Rd,Rd,Rs */ +static u8 arc_divs_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_DIVS | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* div Rd,Rd,imm */ +static u8 arc_divs_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_DIVSI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* remu Rd,Rd,Rs */ +static u8 arc_remu_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_REMU | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* remu Rd,Rd,imm */ +static u8 arc_remu_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_REMUI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* rem Rd,Rd,Rs */ +static u8 arc_rems_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_REMS | OP_A(rd) | 
OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* rem Rd,Rd,imm */ +static u8 arc_rems_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_REMSI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* and Rd,Rd,Rs */ +static u8 arc_and_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_AND | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* and Rd,Rd,limm */ +static u8 arc_and_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_ANDI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* tst Rd,Rs */ +static u8 arc_tst_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_TST | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * This particular version, "tst.z ...", is meant to be used after a + * "tst" on the low 32-bit of register pairs. If that "tst" is not + * zero, then we don't need to test the upper 32-bits lest it sets + * the zero flag. + */ +static u8 arc_tstz_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_TST | OP_B(rd) | OP_C(rs) | CC_equal; + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_or_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_OR | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_or_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_ORI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +static u8 arc_xor_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_XOR | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_xor_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_XORI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +static u8 arc_not_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_NOT | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_btst_i(u8 *buf, u8 rs, u8 imm) +{ + const u32 insn = OPC_BTSTU6 | OP_B(rs) | BTST_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asl_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_ASL | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asli_r(u8 *buf, u8 rd, u8 rs, u8 imm) +{ + const u32 insn = OPC_ASLI | OP_A(rd) | OP_B(rs) | ASLI_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asr_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_ASR | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asri_r(u8 *buf, u8 rd, u8 rs, u8 imm) +{ + const u32 insn = OPC_ASRI | OP_A(rd) | OP_B(rs) | ASRI_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_lsr_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_LSR | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static 
u8 arc_lsri_r(u8 *buf, u8 rd, u8 rs, u8 imm) +{ + const u32 insn = OPC_LSRI | OP_A(rd) | OP_B(rs) | LSRI_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_swape_r(u8 *buf, u8 r) +{ + const u32 insn = OPC_SWAPE | OP_B(r) | OP_C(r); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_jmp_return(u8 *buf) +{ + if (buf) + emit_4_bytes(buf, OPC_J_BLINK); + return INSN_len_normal; +} + +static u8 arc_jl(u8 *buf, u8 reg) +{ + const u32 insn = OPC_JL | OP_C(reg); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * Conditional jump to an address that is max 21 bits away (signed). + * + * b<cc> s21 + */ +static u8 arc_bcc(u8 *buf, u8 cc, int offset) +{ + const u32 insn = OPC_BCC | BCC_S21(offset) | COND(cc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * Unconditional jump to an address that is max 25 bits away (signed). + * + * b s25 + */ +static u8 arc_b(u8 *buf, s32 offset) +{ + const u32 insn = OPC_B | B_S25(offset); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/************* Packers (Deal with BPF_REGs) **************/ + +inline u8 zext(u8 *buf, u8 rd) +{ + if (rd != BPF_REG_FP) + return arc_movi_r(buf, REG_HI(rd), 0); + else + return 0; +} + +u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext) +{ + u8 len = 0; + + if (sign_ext) { + if (sign_ext == 8) + len = arc_sexb_r(buf, REG_LO(rd), REG_LO(rs)); + else if (sign_ext == 16) + len = arc_sexh_r(buf, REG_LO(rd), REG_LO(rs)); + else if (sign_ext == 32 && rd != rs) + len = arc_mov_r(buf, REG_LO(rd), REG_LO(rs)); + + return len; + } + + /* Unsigned move. */ + + if (rd != rs) + len = arc_mov_r(buf, REG_LO(rd), REG_LO(rs)); + + return len; +} + +u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm) +{ + return arc_mov_i(buf, REG_LO(reg), imm); +} + +u8 mov_r64(u8 *buf, u8 rd, u8 rs, u8 sign_ext) +{ + u8 len = 0; + + if (sign_ext) { + /* First handle the low 32-bit part. */ + len = mov_r32(buf, rd, rs, sign_ext); + + /* Now propagate the sign bit of LO to HI. */ + if (sign_ext == 8 || sign_ext == 16 || sign_ext == 32) { + len += arc_asri_r(BUF(buf, len), + REG_HI(rd), REG_LO(rd), 31); + } + + return len; + } + + /* Unsigned move. */ + + if (rd == rs) + return 0; + + len = arc_mov_r(buf, REG_LO(rd), REG_LO(rs)); + + if (rs != BPF_REG_FP) + len += arc_mov_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + /* BPF_REG_FP is mapped to 32-bit "fp" register. */ + else + len += arc_movi_r(BUF(buf, len), REG_HI(rd), 0); + + return len; +} + +/* Sign extend the 32-bit immediate into 64-bit register pair. */ +u8 mov_r64_i32(u8 *buf, u8 reg, s32 imm) +{ + u8 len = 0; + + len = arc_mov_i(buf, REG_LO(reg), imm); + + /* BPF_REG_FP is mapped to 32-bit "fp" register. */ + if (reg != BPF_REG_FP) { + if (imm >= 0) + len += arc_movi_r(BUF(buf, len), REG_HI(reg), 0); + else + len += arc_movi_r(BUF(buf, len), REG_HI(reg), -1); + } + + return len; +} + +/* + * This is merely used for translation of "LD R, IMM64" instructions + * of the BPF. These sort of instructions are sometimes used for + * relocations. 
If during the normal pass, the relocation value is + * not known, the BPF instruction may look something like: + * + * LD R <- 0x0000_0001_0000_0001 + * + * Which will nicely translate to two 4-byte ARC instructions: + * + * mov R_lo, 1 # imm is small enough to be s12 + * mov R_hi, 1 # same + * + * However, during the extra pass, the IMM64 will have changed + * to the resolved address and looks something like: + * + * LD R <- 0x0000_0000_1234_5678 + * + * Now, the translated code will require 12 bytes: + * + * mov R_lo, 0x12345678 # this is an 8-byte instruction + * mov R_hi, 0 # still 4 bytes + * + * Which in practice will result in overwriting the following + * instruction. To avoid such cases, we will always emit codes + * with fixed sizes. + */ +u8 mov_r64_i64(u8 *buf, u8 reg, u32 lo, u32 hi) +{ + u8 len; + + len = arc_mov_i_fixed(buf, REG_LO(reg), lo); + len += arc_mov_i_fixed(BUF(buf, len), REG_HI(reg), hi); + + return len; +} + +/* + * If the "off"set is too big (doesn't encode as S9) for: + * + * {ld,st} r, [rm, off] + * + * Then emit: + * + * add r10, REG_LO(rm), off + * + * and make sure that r10 becomes the effective address: + * + * {ld,st} r, [r10, 0] + */ +static u8 adjust_mem_access(u8 *buf, s16 *off, u8 size, + u8 rm, u8 *arc_reg_mem) +{ + u8 len = 0; + *arc_reg_mem = REG_LO(rm); + + if (!IN_S9_RANGE(*off) || + (size == BPF_DW && !IN_S9_RANGE(*off + 4))) { + len += arc_add_i(BUF(buf, len), + REG_LO(JIT_REG_TMP), REG_LO(rm), (u32)(*off)); + *arc_reg_mem = REG_LO(JIT_REG_TMP); + *off = 0; + } + + return len; +} + +/* store rs, [rd, off] */ +u8 store_r(u8 *buf, u8 rs, u8 rd, s16 off, u8 size) +{ + u8 len, arc_reg_mem; + + len = adjust_mem_access(buf, &off, size, rd, &arc_reg_mem); + + if (size == BPF_DW) { + len += arc_st_r(BUF(buf, len), REG_LO(rs), arc_reg_mem, + off, ZZ_4_byte); + len += arc_st_r(BUF(buf, len), REG_HI(rs), arc_reg_mem, + off + 4, ZZ_4_byte); + } else { + u8 zz = bpf_to_arc_size(size); + + len += arc_st_r(BUF(buf, len), REG_LO(rs), arc_reg_mem, + off, zz); + } + + return len; +} + +/* + * For {8,16,32}-bit stores: + * mov r21, imm + * st r21, [...] + * For 64-bit stores: + * mov r21, imm + * st r21, [...] + * mov r21, {0,-1} + * st r21, [...+4] + */ +u8 store_i(u8 *buf, s32 imm, u8 rd, s16 off, u8 size) +{ + u8 len, arc_reg_mem; + /* REG_LO(JIT_REG_TMP) might be used by "adjust_mem_access()". */ + const u8 arc_rs = REG_HI(JIT_REG_TMP); + + len = adjust_mem_access(buf, &off, size, rd, &arc_reg_mem); + + if (size == BPF_DW) { + len += arc_mov_i(BUF(buf, len), arc_rs, imm); + len += arc_st_r(BUF(buf, len), arc_rs, arc_reg_mem, + off, ZZ_4_byte); + imm = (imm >= 0 ? 0 : -1); + len += arc_mov_i(BUF(buf, len), arc_rs, imm); + len += arc_st_r(BUF(buf, len), arc_rs, arc_reg_mem, + off + 4, ZZ_4_byte); + } else { + u8 zz = bpf_to_arc_size(size); + + len += arc_mov_i(BUF(buf, len), arc_rs, imm); + len += arc_st_r(BUF(buf, len), arc_rs, arc_reg_mem, off, zz); + } + + return len; +} + +/* + * For the calling convention of a little endian machine, the LO part + * must be on top of the stack. + */ +static u8 push_r64(u8 *buf, u8 reg) +{ + u8 len = 0; + +#ifdef __LITTLE_ENDIAN + /* BPF_REG_FP is mapped to 32-bit "fp" register. 
*/ + if (reg != BPF_REG_FP) + len += arc_push_r(BUF(buf, len), REG_HI(reg)); + len += arc_push_r(BUF(buf, len), REG_LO(reg)); +#else + len += arc_push_r(BUF(buf, len), REG_LO(reg)); + if (reg != BPF_REG_FP) + len += arc_push_r(BUF(buf, len), REG_HI(reg)); +#endif + + return len; +} + +/* load rd, [rs, off] */ +u8 load_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size, bool sign_ext) +{ + u8 len, arc_reg_mem; + + len = adjust_mem_access(buf, &off, size, rs, &arc_reg_mem); + + if (size == BPF_B || size == BPF_H || size == BPF_W) { + const u8 zz = bpf_to_arc_size(size); + + /* Use LD.X only if the data size is less than 32-bit. */ + if (sign_ext && (zz == ZZ_1_byte || zz == ZZ_2_byte)) { + len += arc_ldx_r(BUF(buf, len), REG_LO(rd), + arc_reg_mem, off, zz); + } else { + len += arc_ld_r(BUF(buf, len), REG_LO(rd), + arc_reg_mem, off, zz); + } + + if (sign_ext) { + /* Propagate the sign bit to the higher reg. */ + len += arc_asri_r(BUF(buf, len), + REG_HI(rd), REG_LO(rd), 31); + } else { + len += arc_movi_r(BUF(buf, len), REG_HI(rd), 0); + } + } else if (size == BPF_DW) { + /* + * We are about to issue 2 consecutive loads: + * + * ld rx, [rb, off+0] + * ld ry, [rb, off+4] + * + * If "rx" and "rb" are the same registers, then the order + * should change to guarantee that "rb" remains intact + * during these 2 operations: + * + * ld ry, [rb, off+4] + * ld rx, [rb, off+0] + */ + if (REG_LO(rd) != arc_reg_mem) { + len += arc_ld_r(BUF(buf, len), REG_LO(rd), arc_reg_mem, + off, ZZ_4_byte); + len += arc_ld_r(BUF(buf, len), REG_HI(rd), arc_reg_mem, + off + 4, ZZ_4_byte); + } else { + len += arc_ld_r(BUF(buf, len), REG_HI(rd), arc_reg_mem, + off + 4, ZZ_4_byte); + len += arc_ld_r(BUF(buf, len), REG_LO(rd), arc_reg_mem, + off, ZZ_4_byte); + } + } + + return len; +} + +u8 add_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_add_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 add_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + if (IN_U6_RANGE(imm)) + return arc_addi_r(buf, REG_LO(rd), imm); + else + return arc_add_i(buf, REG_LO(rd), REG_LO(rd), imm); +} + +u8 add_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_addf_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_adc_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 add_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + if (IN_U6_RANGE(imm)) { + len = arc_addif_r(buf, REG_LO(rd), imm); + len += arc_adci_r(BUF(buf, len), REG_HI(rd), 0); + } else { + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += add_r64(BUF(buf, len), rd, JIT_REG_TMP); + } + return len; +} + +u8 sub_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_sub_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 sub_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + if (IN_U6_RANGE(imm)) + return arc_subi_r(buf, REG_LO(rd), imm); + else + return arc_sub_i(buf, REG_LO(rd), imm); +} + +u8 sub_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_subf_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_sbc_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 sub_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += sub_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +static u8 cmp_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_cmp_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 neg_r32(u8 *buf, u8 r) +{ + return arc_neg_r(buf, REG_LO(r), REG_LO(r)); +} + +/* In a two's complement system, -r is (~r + 1). 
*/ +u8 neg_r64(u8 *buf, u8 r) +{ + u8 len; + + len = arc_not_r(buf, REG_LO(r), REG_LO(r)); + len += arc_not_r(BUF(buf, len), REG_HI(r), REG_HI(r)); + len += add_r64_i32(BUF(buf, len), r, 1); + return len; +} + +u8 mul_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_mpy_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 mul_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_mpy_i(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * MUL B, C + * -------- + * mpy t0, B_hi, C_lo + * mpy t1, B_lo, C_hi + * mpydu B_lo, B_lo, C_lo + * add B_hi, B_hi, t0 + * add B_hi, B_hi, t1 + */ +u8 mul_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 C_hi = REG_HI(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_mpy_r(buf, t0, B_hi, C_lo); + len += arc_mpy_r(BUF(buf, len), t1, B_lo, C_hi); + len += arc_mpydu_r(BUF(buf, len), B_lo, C_lo); + len += arc_add_r(BUF(buf, len), B_hi, t0); + len += arc_add_r(BUF(buf, len), B_hi, t1); + + return len; +} + +/* + * MUL B, imm + * ---------- + * + * To get a 64-bit result from a signed 64x32 multiplication: + * + * B_hi B_lo * + * sign imm + * ----------------------------- + * HI(B_lo*imm) LO(B_lo*imm) + + * B_hi*imm + + * B_lo*sign + * ----------------------------- + * res_hi res_lo + * + * mpy t1, B_lo, sign(imm) + * mpy t0, B_hi, imm + * mpydu B_lo, B_lo, imm + * add B_hi, B_hi, t0 + * add B_hi, B_hi, t1 + * + * Note: We can't use signed double multiplication, "mpyd", instead of an + * unsigned version, "mpydu", and then get rid of the sign adjustments + * calculated in "t1". The signed multiplication, "mpyd", will consider + * both operands, "B_lo" and "imm", as signed inputs. However, for this + * 64x32 multiplication, "B_lo" must be treated as an unsigned number. + */ +u8 mul_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len = 0; + + if (imm == 1) + return 0; + + /* Is the sign-extension of the immediate "-1"? */ + if (imm < 0) + len += arc_neg_r(BUF(buf, len), t1, B_lo); + + len += arc_mpy_i(BUF(buf, len), t0, B_hi, imm); + len += arc_mpydu_i(BUF(buf, len), B_lo, imm); + len += arc_add_r(BUF(buf, len), B_hi, t0); + + /* Add the "sign*B_lo" part, if necessary. 
*/ + if (imm < 0) + len += arc_add_r(BUF(buf, len), B_hi, t1); + + return len; +} + +u8 div_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext) +{ + if (sign_ext) + return arc_divs_r(buf, REG_LO(rd), REG_LO(rs)); + else + return arc_divu_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 div_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext) +{ + if (imm == 0) + return 0; + + if (sign_ext) + return arc_divs_i(buf, REG_LO(rd), imm); + else + return arc_divu_i(buf, REG_LO(rd), imm); +} + +u8 mod_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext) +{ + if (sign_ext) + return arc_rems_r(buf, REG_LO(rd), REG_LO(rs)); + else + return arc_remu_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 mod_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext) +{ + if (imm == 0) + return 0; + + if (sign_ext) + return arc_rems_i(buf, REG_LO(rd), imm); + else + return arc_remu_i(buf, REG_LO(rd), imm); +} + +u8 and_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_and_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 and_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_and_i(buf, REG_LO(rd), imm); +} + +u8 and_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_and_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_and_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 and_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += and_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +static u8 tst_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_tst_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 or_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_or_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 or_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_or_i(buf, REG_LO(rd), imm); +} + +u8 or_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_or_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); + len += arc_or_r(BUF(buf, len), REG_HI(rd), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 or_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += or_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +u8 xor_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_xor_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 xor_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_xor_i(buf, REG_LO(rd), imm); +} + +u8 xor_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_xor_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_xor_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 xor_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += xor_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +/* "asl a,b,c" --> "a = (b << (c & 31))". */ +u8 lsh_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_asl_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 lsh_r32_i32(u8 *buf, u8 rd, u8 imm) +{ + return arc_asli_r(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * algorithm + * --------- + * if (n <= 32) + * to_hi = lo >> (32-n) # (32-n) is the negate of "n" in a 5-bit width. + * lo <<= n + * hi <<= n + * hi |= to_hi + * else + * hi = lo << (n-32) + * lo = 0 + * + * assembly translation for "LSH B, C" + * (heavily influenced by ARC gcc) + * ----------------------------------- + * not t0, C_lo # The first 3 lines are almost the same as: + * lsr t1, B_lo, 1 # neg t0, C_lo + * lsr t1, t1, t0 # lsr t1, B_lo, t0 --> t1 is "to_hi" + * mov t0, C_lo* # with one important difference. In "neg" + * asl B_lo, B_lo, t0 # version, when C_lo=0, t1 becomes B_lo while + * asl B_hi, B_hi, t0 # it should be 0. 
The "not" approach instead, + * or B_hi, B_hi, t1 # "shift"s t1 once and 31 times, practically + * btst t0, 5 # setting it to 0 when C_lo=0. + * mov.ne B_hi, B_lo** + * mov.ne B_lo, 0 + * + * *The "mov t0, C_lo" is necessary to cover the cases that C is the same + * register as B. + * + * **ARC performs a shift in this manner: B <<= (C & 31) + * For 32<=n<64, "n-32" and "n&31" are the same. Therefore, "B << n" and + * "B << (n-32)" yield the same results. e.g. the results of "B << 35" and + * "B << 3" are the same. + * + * The behaviour is undefined for n >= 64. + */ +u8 lsh_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_not_r(buf, t0, C_lo); + len += arc_lsri_r(BUF(buf, len), t1, B_lo, 1); + len += arc_lsr_r(BUF(buf, len), t1, t1, t0); + len += arc_mov_r(BUF(buf, len), t0, C_lo); + len += arc_asl_r(BUF(buf, len), B_lo, B_lo, t0); + len += arc_asl_r(BUF(buf, len), B_hi, B_hi, t0); + len += arc_or_r(BUF(buf, len), B_hi, B_hi, t1); + len += arc_btst_i(BUF(buf, len), t0, 5); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_hi, B_lo); + len += arc_movu_cc_r(BUF(buf, len), CC_unequal, B_lo, 0); + + return len; +} + +/* + * if (n < 32) + * to_hi = B_lo >> 32-n # extract upper n bits + * lo <<= n + * hi <<=n + * hi |= to_hi + * else if (n < 64) + * hi = lo << n-32 + * lo = 0 + */ +u8 lsh_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + const u8 n = (u8)imm; + u8 len = 0; + + if (n == 0) { + return 0; + } else if (n <= 31) { + len = arc_lsri_r(buf, t0, B_lo, 32 - n); + len += arc_asli_r(BUF(buf, len), B_lo, B_lo, n); + len += arc_asli_r(BUF(buf, len), B_hi, B_hi, n); + len += arc_or_r(BUF(buf, len), B_hi, B_hi, t0); + } else if (n <= 63) { + len = arc_asli_r(buf, B_hi, B_lo, n - 32); + len += arc_movi_r(BUF(buf, len), B_lo, 0); + } + /* n >= 64 is undefined behaviour. */ + + return len; +} + +/* "lsr a,b,c" --> "a = (b >> (c & 31))". */ +u8 rsh_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_lsr_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 rsh_r32_i32(u8 *buf, u8 rd, u8 imm) +{ + return arc_lsri_r(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * For better commentary, see lsh_r64(). 
+ * + * algorithm + * --------- + * if (n <= 32) + * to_lo = hi << (32-n) + * hi >>= n + * lo >>= n + * lo |= to_lo + * else + * lo = hi >> (n-32) + * hi = 0 + * + * RSH B,C + * ---------- + * not t0, C_lo + * asl t1, B_hi, 1 + * asl t1, t1, t0 + * mov t0, C_lo + * lsr B_hi, B_hi, t0 + * lsr B_lo, B_lo, t0 + * or B_lo, B_lo, t1 + * btst t0, 5 + * mov.ne B_lo, B_hi + * mov.ne B_hi, 0 + */ +u8 rsh_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_not_r(buf, t0, C_lo); + len += arc_asli_r(BUF(buf, len), t1, B_hi, 1); + len += arc_asl_r(BUF(buf, len), t1, t1, t0); + len += arc_mov_r(BUF(buf, len), t0, C_lo); + len += arc_lsr_r(BUF(buf, len), B_hi, B_hi, t0); + len += arc_lsr_r(BUF(buf, len), B_lo, B_lo, t0); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t1); + len += arc_btst_i(BUF(buf, len), t0, 5); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_lo, B_hi); + len += arc_movu_cc_r(BUF(buf, len), CC_unequal, B_hi, 0); + + return len; +} + +/* + * if (n < 32) + * to_lo = B_lo << 32-n # extract lower n bits, right-padded with 32-n 0s + * lo >>=n + * hi >>=n + * hi |= to_lo + * else if (n < 64) + * lo = hi >> n-32 + * hi = 0 + */ +u8 rsh_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + const u8 n = (u8)imm; + u8 len = 0; + + if (n == 0) { + return 0; + } else if (n <= 31) { + len = arc_asli_r(buf, t0, B_hi, 32 - n); + len += arc_lsri_r(BUF(buf, len), B_lo, B_lo, n); + len += arc_lsri_r(BUF(buf, len), B_hi, B_hi, n); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t0); + } else if (n <= 63) { + len = arc_lsri_r(buf, B_lo, B_hi, n - 32); + len += arc_movi_r(BUF(buf, len), B_hi, 0); + } + /* n >= 64 is undefined behaviour. */ + + return len; +} + +/* "asr a,b,c" --> "a = (b s>> (c & 31))". */ +u8 arsh_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_asr_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 arsh_r32_i32(u8 *buf, u8 rd, u8 imm) +{ + return arc_asri_r(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * For comparison, see rsh_r64(). 
+ * + * algorithm + * --------- + * if (n <= 32) + * to_lo = hi << (32-n) + * hi s>>= n + * lo >>= n + * lo |= to_lo + * else + * hi_sign = hi s>>31 + * lo = hi s>> (n-32) + * hi = hi_sign + * + * ARSH B,C + * ---------- + * not t0, C_lo + * asl t1, B_hi, 1 + * asl t1, t1, t0 + * mov t0, C_lo + * asr B_hi, B_hi, t0 + * lsr B_lo, B_lo, t0 + * or B_lo, B_lo, t1 + * btst t0, 5 + * asr t0, B_hi, 31 # now, t0 = 0 or -1 based on B_hi's sign + * mov.ne B_lo, B_hi + * mov.ne B_hi, t0 + */ +u8 arsh_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_not_r(buf, t0, C_lo); + len += arc_asli_r(BUF(buf, len), t1, B_hi, 1); + len += arc_asl_r(BUF(buf, len), t1, t1, t0); + len += arc_mov_r(BUF(buf, len), t0, C_lo); + len += arc_asr_r(BUF(buf, len), B_hi, B_hi, t0); + len += arc_lsr_r(BUF(buf, len), B_lo, B_lo, t0); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t1); + len += arc_btst_i(BUF(buf, len), t0, 5); + len += arc_asri_r(BUF(buf, len), t0, B_hi, 31); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_lo, B_hi); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_hi, t0); + + return len; +} + +/* + * if (n < 32) + * to_lo = lo << 32-n # extract lower n bits, right-padded with 32-n 0s + * lo >>=n + * hi s>>=n + * hi |= to_lo + * else if (n < 64) + * lo = hi s>> n-32 + * hi = (lo[msb] ? -1 : 0) + */ +u8 arsh_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + const u8 n = (u8)imm; + u8 len = 0; + + if (n == 0) { + return 0; + } else if (n <= 31) { + len = arc_asli_r(buf, t0, B_hi, 32 - n); + len += arc_lsri_r(BUF(buf, len), B_lo, B_lo, n); + len += arc_asri_r(BUF(buf, len), B_hi, B_hi, n); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t0); + } else if (n <= 63) { + len = arc_asri_r(buf, B_lo, B_hi, n - 32); + len += arc_movi_r(BUF(buf, len), B_hi, -1); + len += arc_btst_i(BUF(buf, len), B_lo, 31); + len += arc_movu_cc_r(BUF(buf, len), CC_equal, B_hi, 0); + } + /* n >= 64 is undefined behaviour. */ + + return len; +} + +u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) +{ + u8 len = 0; +#ifdef __BIG_ENDIAN + const u8 host_endian = BPF_FROM_BE; +#else + const u8 host_endian = BPF_FROM_LE; +#endif + if (host_endian != endian || force) { + switch (size) { + case 16: + /* + * r = B4B3_B2B1 << 16 --> r = B2B1_0000 + * then, swape(r) would become the desired 0000_B1B2 + */ + len = arc_asli_r(buf, REG_LO(rd), REG_LO(rd), 16); + fallthrough; + case 32: + len += arc_swape_r(BUF(buf, len), REG_LO(rd)); + if (do_zext) + len += zext(BUF(buf, len), rd); + break; + case 64: + /* + * swap "hi" and "lo": + * hi ^= lo; + * lo ^= hi; + * hi ^= lo; + * and then swap the bytes in "hi" and "lo". + */ + len = arc_xor_r(buf, REG_HI(rd), REG_LO(rd)); + len += arc_xor_r(BUF(buf, len), REG_LO(rd), REG_HI(rd)); + len += arc_xor_r(BUF(buf, len), REG_HI(rd), REG_LO(rd)); + len += arc_swape_r(BUF(buf, len), REG_LO(rd)); + len += arc_swape_r(BUF(buf, len), REG_HI(rd)); + break; + default: + /* The caller must have handled this. */ + } + } else { + /* + * If the same endianness, there's not much to do other + * than zeroing out the upper bytes based on the "size". 
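+		 * For instance (per the switch below), a 16-bit request on a
+		 * matching host only needs the "and" with 0xffff plus the
+		 * optional zext, the 32-bit case needs only that optional
+		 * zext, and the 64-bit case emits nothing at all.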
+ */ + switch (size) { + case 16: + len = arc_and_i(buf, REG_LO(rd), 0xffff); + fallthrough; + case 32: + if (do_zext) + len += zext(BUF(buf, len), rd); + break; + case 64: + break; + default: + /* The caller must have handled this. */ + } + } + + return len; +} + +/* + * To create a frame, all that is needed is: + * + * push fp + * mov fp, sp + * sub sp, <frame_size> + * + * "push fp" is taken care of separately while saving the clobbered registers. + * All that remains is copying SP value to FP and shrinking SP's address space + * for any possible function call to come. + */ +static inline u8 frame_create(u8 *buf, u16 size) +{ + u8 len; + + len = arc_mov_r(buf, ARC_R_FP, ARC_R_SP); + if (IN_U6_RANGE(size)) + len += arc_subi_r(BUF(buf, len), ARC_R_SP, size); + else + len += arc_sub_i(BUF(buf, len), ARC_R_SP, size); + return len; +} + +/* + * mov sp, fp + * + * The value of SP upon entering was copied to FP. + */ +static inline u8 frame_restore(u8 *buf) +{ + return arc_mov_r(buf, ARC_R_SP, ARC_R_FP); +} + +/* + * Going from a JITed code to the native caller: + * + * mov ARC_ABI_RET_lo, BPF_REG_0_lo # r0 <- r8 + * mov ARC_ABI_RET_hi, BPF_REG_0_hi # r1 <- r9 + */ +static u8 bpf_to_arc_return(u8 *buf) +{ + u8 len; + + len = arc_mov_r(buf, ARC_R_0, REG_LO(BPF_REG_0)); + len += arc_mov_r(BUF(buf, len), ARC_R_1, REG_HI(BPF_REG_0)); + return len; +} + +/* + * Coming back from an external (in-kernel) function to the JITed code: + * + * mov ARC_ABI_RET_lo, BPF_REG_0_lo # r8 <- r0 + * mov ARC_ABI_RET_hi, BPF_REG_0_hi # r9 <- r1 + */ +u8 arc_to_bpf_return(u8 *buf) +{ + u8 len; + + len = arc_mov_r(buf, REG_LO(BPF_REG_0), ARC_R_0); + len += arc_mov_r(BUF(buf, len), REG_HI(BPF_REG_0), ARC_R_1); + return len; +} + +/* + * This translation leads to: + * + * mov r10, addr # always an 8-byte instruction + * jl [r10] + * + * The length of the "mov" must be fixed (8), otherwise it may diverge + * during the normal and extra passes: + * + * normal pass extra pass + * + * 180: mov r10,0 | 180: mov r10,0x700578d8 + * 184: jl [r10] | 188: jl [r10] + * 188: add.f r16,r16,0x1 | 18c: adc r17,r17,0 + * 18c: adc r17,r17,0 | + * + * In the above example, the change from "r10 <- 0" to "r10 <- 0x700578d8" + * has led to an increase in the length of the "mov" instruction. + * Inadvertently, that caused the loss of the "add.f" instruction. + */ +static u8 jump_and_link(u8 *buf, u32 addr) +{ + u8 len; + + len = arc_mov_i_fixed(buf, REG_LO(JIT_REG_TMP), addr); + len += arc_jl(BUF(buf, len), REG_LO(JIT_REG_TMP)); + return len; +} + +/* + * This function determines which ARC registers must be saved and restored. + * It does so by looking into: + * + * "bpf_reg": The clobbered (destination) BPF register + * "is_call": Indicator if the current instruction is a call + * + * When a register of interest is clobbered, its corresponding bit position + * in return value, "usage", is set to true. + */ +u32 mask_for_used_regs(u8 bpf_reg, bool is_call) +{ + u32 usage = 0; + + /* BPF registers that must be saved. */ + if (bpf_reg >= BPF_REG_6 && bpf_reg <= BPF_REG_9) { + usage |= BIT(REG_LO(bpf_reg)); + usage |= BIT(REG_HI(bpf_reg)); + /* + * Using the frame pointer register implies that it should + * be saved and reinitialised with the current frame data. + */ + } else if (bpf_reg == BPF_REG_FP) { + usage |= BIT(REG_LO(BPF_REG_FP)); + /* Could there be some ARC registers that must to be saved? 
*/ + } else { + if (REG_LO(bpf_reg) >= ARC_CALLEE_SAVED_REG_FIRST && + REG_LO(bpf_reg) <= ARC_CALLEE_SAVED_REG_LAST) + usage |= BIT(REG_LO(bpf_reg)); + + if (REG_HI(bpf_reg) >= ARC_CALLEE_SAVED_REG_FIRST && + REG_HI(bpf_reg) <= ARC_CALLEE_SAVED_REG_LAST) + usage |= BIT(REG_HI(bpf_reg)); + } + + /* A "call" indicates that ARC's "blink" reg must be saved. */ + usage |= is_call ? BIT(ARC_R_BLINK) : 0; + + return usage; +} + +/* + * push blink # if blink is marked as clobbered + * push r[0-n] # if r[i] is marked as clobbered + * push fp # if fp is marked as clobbered + * mov fp, sp # if frame_size > 0 (clobbers fp) + * sub sp, <frame_size> # same as above + */ +u8 arc_prologue(u8 *buf, u32 usage, u16 frame_size) +{ + u8 len = 0; + u32 gp_regs = 0; + + /* Deal with blink first. */ + if (usage & BIT(ARC_R_BLINK)) + len += arc_push_r(BUF(buf, len), ARC_R_BLINK); + + gp_regs = usage & ~(BIT(ARC_R_BLINK) | BIT(ARC_R_FP)); + while (gp_regs) { + u8 reg = __builtin_ffs(gp_regs) - 1; + + len += arc_push_r(BUF(buf, len), reg); + gp_regs &= ~BIT(reg); + } + + /* Deal with fp last. */ + if ((usage & BIT(ARC_R_FP)) || frame_size > 0) + len += arc_push_r(BUF(buf, len), ARC_R_FP); + + if (frame_size > 0) + len += frame_create(BUF(buf, len), frame_size); + +#ifdef ARC_BPF_JIT_DEBUG + if ((usage & BIT(ARC_R_FP)) && frame_size == 0) { + pr_err("FP is being saved while there is no frame."); + BUG(); + } +#endif + + return len; +} + +/* + * mov sp, fp # if frame_size > 0 + * pop fp # if fp is marked as clobbered + * pop r[n-0] # if r[i] is marked as clobbered + * pop blink # if blink is marked as clobbered + * mov r0, r8 # always: ABI_return <- BPF_return + * mov r1, r9 # continuation of above + * j [blink] # always + * + * "fp being marked as clobbered" and "frame_size > 0" are the two sides of + * the same coin. + */ +u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size) +{ + u32 len = 0; + u32 gp_regs = 0; + +#ifdef ARC_BPF_JIT_DEBUG + if ((usage & BIT(ARC_R_FP)) && frame_size == 0) { + pr_err("FP is being saved while there is no frame."); + BUG(); + } +#endif + + if (frame_size > 0) + len += frame_restore(BUF(buf, len)); + + /* Deal with fp first. */ + if ((usage & BIT(ARC_R_FP)) || frame_size > 0) + len += arc_pop_r(BUF(buf, len), ARC_R_FP); + + gp_regs = usage & ~(BIT(ARC_R_BLINK) | BIT(ARC_R_FP)); + while (gp_regs) { + /* "usage" is 32-bit, each bit indicating an ARC register. */ + u8 reg = 31 - __builtin_clz(gp_regs); + + len += arc_pop_r(BUF(buf, len), reg); + gp_regs &= ~BIT(reg); + } + + /* Deal with blink last. */ + if (usage & BIT(ARC_R_BLINK)) + len += arc_pop_r(BUF(buf, len), ARC_R_BLINK); + + /* Wrap up the return value and jump back to the caller. */ + len += bpf_to_arc_return(BUF(buf, len)); + len += arc_jmp_return(BUF(buf, len)); + + return len; +} + +/* + * For details on the algorithm, see the comments of "gen_jcc_64()". + * + * This data structure is holding information for jump translations. + * + * jit_off: How many bytes into the current JIT address, "b"ranch insn. occurs + * cond: The condition that the ARC branch instruction must use + * + * e.g.: + * + * BPF_JGE R1, R0, @target + * ------------------------ + * | + * v + * 0x1000: cmp r3, r1 # 0x1000 is the JIT address for "BPF_JGE ..." insn + * 0x1004: bhi @target # first jump (branch higher) + * 0x1008: blo @end # second jump acting as a skip (end is 0x1014) + * 0x100C: cmp r2, r0 # the lower 32 bits are evaluated + * 0x1010: bhs @target # third jump (branch higher or same) + * 0x1014: ... 
+ * + * The jit_off(set) of the "bhi" is 4 bytes. + * The cond(ition) for the "bhi" is "CC_great_u". + * + * The jit_off(set) is necessary for calculating the exact displacement + * to the "target" address: + * + * jit_address + jit_off(set) - @target + * 0x1000 + 4 - @target + */ +#define JCC64_NR_OF_JMPS 3 /* Number of jumps in jcc64 template. */ +#define JCC64_INSNS_TO_END 3 /* Number of insn. inclusive the 2nd jmp to end. */ +#define JCC64_SKIP_JMP 1 /* Index of the "skip" jump to "end". */ +const struct { + /* + * "jit_off" is common between all "jmp[]" and is coupled with + * "cond" of each "jmp[]" instance. e.g.: + * + * arcv2_64_jccs.jit_off[1] + * arcv2_64_jccs.jmp[ARC_CC_UGT].cond[1] + * + * Are indicating that the second jump in JITed code of "UGT" + * is at offset "jit_off[1]" while its condition is "cond[1]". + */ + u8 jit_off[JCC64_NR_OF_JMPS]; + + struct { + u8 cond[JCC64_NR_OF_JMPS]; + } jmp[ARC_CC_SLE + 1]; +} arcv2_64_jccs = { + .jit_off = { + INSN_len_normal * 1, + INSN_len_normal * 2, + INSN_len_normal * 4 + }, + /* + * cmp rd_hi, rs_hi + * bhi @target # 1: u> + * blo @end # 2: u< + * cmp rd_lo, rs_lo + * bhi @target # 3: u> + * end: + */ + .jmp[ARC_CC_UGT] = { + .cond = {CC_great_u, CC_less_u, CC_great_u} + }, + /* + * cmp rd_hi, rs_hi + * bhi @target # 1: u> + * blo @end # 2: u< + * cmp rd_lo, rs_lo + * bhs @target # 3: u>= + * end: + */ + .jmp[ARC_CC_UGE] = { + .cond = {CC_great_u, CC_less_u, CC_great_eq_u} + }, + /* + * cmp rd_hi, rs_hi + * blo @target # 1: u< + * bhi @end # 2: u> + * cmp rd_lo, rs_lo + * blo @target # 3: u< + * end: + */ + .jmp[ARC_CC_ULT] = { + .cond = {CC_less_u, CC_great_u, CC_less_u} + }, + /* + * cmp rd_hi, rs_hi + * blo @target # 1: u< + * bhi @end # 2: u> + * cmp rd_lo, rs_lo + * bls @target # 3: u<= + * end: + */ + .jmp[ARC_CC_ULE] = { + .cond = {CC_less_u, CC_great_u, CC_less_eq_u} + }, + /* + * cmp rd_hi, rs_hi + * bgt @target # 1: s> + * blt @end # 2: s< + * cmp rd_lo, rs_lo + * bhi @target # 3: u> + * end: + */ + .jmp[ARC_CC_SGT] = { + .cond = {CC_great_s, CC_less_s, CC_great_u} + }, + /* + * cmp rd_hi, rs_hi + * bgt @target # 1: s> + * blt @end # 2: s< + * cmp rd_lo, rs_lo + * bhs @target # 3: u>= + * end: + */ + .jmp[ARC_CC_SGE] = { + .cond = {CC_great_s, CC_less_s, CC_great_eq_u} + }, + /* + * cmp rd_hi, rs_hi + * blt @target # 1: s< + * bgt @end # 2: s> + * cmp rd_lo, rs_lo + * blo @target # 3: u< + * end: + */ + .jmp[ARC_CC_SLT] = { + .cond = {CC_less_s, CC_great_s, CC_less_u} + }, + /* + * cmp rd_hi, rs_hi + * blt @target # 1: s< + * bgt @end # 2: s> + * cmp rd_lo, rs_lo + * bls @target # 3: u<= + * end: + */ + .jmp[ARC_CC_SLE] = { + .cond = {CC_less_s, CC_great_s, CC_less_eq_u} + } +}; + +/* + * The displacement (offset) for ARC's "b"ranch instruction is the distance + * from the aligned version of _current_ instruction (PCL) to the target + * instruction: + * + * DISP = TARGET - PCL # PCL is the word aligned PC + */ +static inline s32 get_displacement(u32 curr_off, u32 targ_off) +{ + return (s32)(targ_off - (curr_off & ~3L)); +} + +/* + * "disp"lacement should be: + * + * 1. 16-bit aligned. + * 2. fit in S25, because no "condition code" is supposed to be encoded. + */ +static inline bool is_valid_far_disp(s32 disp) +{ + return (!(disp & 1) && IN_S25_RANGE(disp)); +} + +/* + * "disp"lacement should be: + * + * 1. 16-bit aligned. + * 2. fit in S21, because "condition code" is supposed to be encoded too. 
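+ *
+ * In other words, conditional branches reach roughly +/- 1 MiB (S21),
+ * while the far variant checked by is_valid_far_disp() reaches roughly
+ * +/- 16 MiB (S25).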
+ */ +static inline bool is_valid_near_disp(s32 disp) +{ + return (!(disp & 1) && IN_S21_RANGE(disp)); +} + +/* + * cmp rd_hi, rs_hi + * cmp.z rd_lo, rs_lo + * b{eq,ne} @target + * | | + * | `--> "eq" param is false (JNE) + * `-----> "eq" param is true (JEQ) + */ +static int gen_j_eq_64(u8 *buf, u8 rd, u8 rs, bool eq, + u32 curr_off, u32 targ_off) +{ + s32 disp; + u8 len = 0; + + len += arc_cmp_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + len += arc_cmpz_r(BUF(buf, len), REG_LO(rd), REG_LO(rs)); + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), eq ? CC_equal : CC_unequal, disp); + + return len; +} + +/* + * tst rd_hi, rs_hi + * tst.z rd_lo, rs_lo + * bne @target + */ +static u8 gen_jset_64(u8 *buf, u8 rd, u8 rs, u32 curr_off, u32 targ_off) +{ + u8 len = 0; + s32 disp; + + len += arc_tst_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + len += arc_tstz_r(BUF(buf, len), REG_LO(rd), REG_LO(rs)); + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), CC_unequal, disp); + + return len; +} + +/* + * Verify if all the jumps for a JITed jcc64 operation are valid, + * by consulting the data stored at "arcv2_64_jccs". + */ +static bool check_jcc_64(u32 curr_off, u32 targ_off, u8 cond) +{ + size_t i; + + if (cond >= ARC_CC_LAST) + return false; + + for (i = 0; i < JCC64_NR_OF_JMPS; i++) { + u32 from, to; + + from = curr_off + arcv2_64_jccs.jit_off[i]; + /* for the 2nd jump, we jump to the end of block. */ + if (i != JCC64_SKIP_JMP) + to = targ_off; + else + to = from + (JCC64_INSNS_TO_END * INSN_len_normal); + /* There is a "cc" in the instruction, so a "near" jump. */ + if (!is_valid_near_disp(get_displacement(from, to))) + return false; + } + + return true; +} + +/* Can the jump from "curr_off" to "targ_off" actually happen? */ +bool check_jmp_64(u32 curr_off, u32 targ_off, u8 cond) +{ + s32 disp; + + switch (cond) { + case ARC_CC_UGT: + case ARC_CC_UGE: + case ARC_CC_ULT: + case ARC_CC_ULE: + case ARC_CC_SGT: + case ARC_CC_SGE: + case ARC_CC_SLT: + case ARC_CC_SLE: + return check_jcc_64(curr_off, targ_off, cond); + case ARC_CC_EQ: + case ARC_CC_NE: + case ARC_CC_SET: + /* + * The "jump" for the JITed BPF_J{SET,EQ,NE} is actually the + * 3rd instruction. See comments of "gen_j{set,_eq}_64()". + */ + curr_off += 2 * INSN_len_normal; + disp = get_displacement(curr_off, targ_off); + /* There is a "cc" field in the issued instruction. */ + return is_valid_near_disp(disp); + case ARC_CC_AL: + disp = get_displacement(curr_off, targ_off); + return is_valid_far_disp(disp); + default: + return false; + } +} + +/* + * The template for the 64-bit jumps with the following BPF conditions + * + * u< u<= u> u>= s< s<= s> s>= + * + * Looks like below: + * + * cmp rd_hi, rs_hi + * b<c1> @target + * b<c2> @end + * cmp rd_lo, rs_lo # if execution reaches here, r{d,s}_hi are equal + * b<c3> @target + * end: + * + * "c1" is the condition that JIT is handling minus the equality part. + * For instance if we have to translate an "unsigned greater or equal", + * then "c1" will be "unsigned greater". We won't know about equality + * until all 64-bits of data (higeher and lower registers) are processed. + * + * "c2" is the counter logic of "c1". For instance, if "c1" is originated + * from "s>", then "c2" would be "s<". Notice that equality doesn't play + * a role here either, because the lower 32 bits are not processed yet. + * + * "c3" is the unsigned version of "c1", no matter if the BPF condition + * was signed or unsigned. 
An unsigned version is necessary, because the + * MSB of the lower 32 bits does not reflect a sign in the whole 64-bit + * scheme. Otherwise, 64-bit comparisons like + * (0x0000_0000,0x8000_0000) s>= (0x0000_0000,0x0000_0000) + * would yield an incorrect result. Finally, if there is an equality + * check in the BPF condition, it will be reflected in "c3". + * + * You can find all the instances of this template where the + * "arcv2_64_jccs" is getting initialised. + */ +static u8 gen_jcc_64(u8 *buf, u8 rd, u8 rs, u8 cond, + u32 curr_off, u32 targ_off) +{ + s32 disp; + u32 end_off; + const u8 *cc = arcv2_64_jccs.jmp[cond].cond; + u8 len = 0; + + /* cmp rd_hi, rs_hi */ + len += arc_cmp_r(buf, REG_HI(rd), REG_HI(rs)); + + /* b<c1> @target */ + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), cc[0], disp); + + /* b<c2> @end */ + end_off = curr_off + len + (JCC64_INSNS_TO_END * INSN_len_normal); + disp = get_displacement(curr_off + len, end_off); + len += arc_bcc(BUF(buf, len), cc[1], disp); + + /* cmp rd_lo, rs_lo */ + len += arc_cmp_r(BUF(buf, len), REG_LO(rd), REG_LO(rs)); + + /* b<c3> @target */ + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), cc[2], disp); + + return len; +} + +/* + * This function only applies the necessary logic to make the proper + * translations. All the sanity checks must have already been done + * by calling the check_jmp_64(). + */ +u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off) +{ + u8 len = 0; + bool eq = false; + s32 disp; + + switch (cond) { + case ARC_CC_AL: + disp = get_displacement(curr_off, targ_off); + len = arc_b(buf, disp); + break; + case ARC_CC_UGT: + case ARC_CC_UGE: + case ARC_CC_ULT: + case ARC_CC_ULE: + case ARC_CC_SGT: + case ARC_CC_SGE: + case ARC_CC_SLT: + case ARC_CC_SLE: + len = gen_jcc_64(buf, rd, rs, cond, curr_off, targ_off); + break; + case ARC_CC_EQ: + eq = true; + fallthrough; + case ARC_CC_NE: + len = gen_j_eq_64(buf, rd, rs, eq, curr_off, targ_off); + break; + case ARC_CC_SET: + len = gen_jset_64(buf, rd, rs, curr_off, targ_off); + break; + default: +#ifdef ARC_BPF_JIT_DEBUG + pr_err("64-bit jump condition is not known."); + BUG(); +#endif + } + return len; +} + +/* + * The condition codes to use when generating JIT instructions + * for 32-bit jumps. + * + * The "ARC_CC_AL" index is not really used by the code, but it + * is here for the sake of completeness. + * + * The "ARC_CC_SET" becomes "CC_unequal" because of the "tst" + * instruction that precedes the conditional branch. + */ +const u8 arcv2_32_jmps[ARC_CC_LAST] = { + [ARC_CC_UGT] = CC_great_u, + [ARC_CC_UGE] = CC_great_eq_u, + [ARC_CC_ULT] = CC_less_u, + [ARC_CC_ULE] = CC_less_eq_u, + [ARC_CC_SGT] = CC_great_s, + [ARC_CC_SGE] = CC_great_eq_s, + [ARC_CC_SLT] = CC_less_s, + [ARC_CC_SLE] = CC_less_eq_s, + [ARC_CC_AL] = CC_always, + [ARC_CC_EQ] = CC_equal, + [ARC_CC_NE] = CC_unequal, + [ARC_CC_SET] = CC_unequal +}; + +/* Can the jump from "curr_off" to "targ_off" actually happen? */ +bool check_jmp_32(u32 curr_off, u32 targ_off, u8 cond) +{ + u8 addendum; + s32 disp; + + if (cond >= ARC_CC_LAST) + return false; + + /* + * The unconditional jump happens immediately, while the rest + * are either preceded by a "cmp" or "tst" instruction. + */ + addendum = (cond == ARC_CC_AL) ? 
0 : INSN_len_normal; + disp = get_displacement(curr_off + addendum, targ_off); + + if (ARC_CC_AL) + return is_valid_far_disp(disp); + else + return is_valid_near_disp(disp); +} + +/* + * The JITed code for 32-bit (conditional) branches: + * + * ARC_CC_AL @target + * b @jit_targ_addr + * + * ARC_CC_SET rd, rs, @target + * tst rd, rs + * bnz @jit_targ_addr + * + * ARC_CC_xx rd, rs, @target + * cmp rd, rs + * b<cc> @jit_targ_addr # cc = arcv2_32_jmps[xx] + */ +u8 gen_jmp_32(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off) +{ + s32 disp; + u8 len = 0; + + /* + * Although this must have already been checked by "check_jmp_32()", + * we're not going to risk accessing "arcv2_32_jmps" array without + * the boundary check. + */ + if (cond >= ARC_CC_LAST) { +#ifdef ARC_BPF_JIT_DEBUG + pr_err("32-bit jump condition is not known."); + BUG(); +#endif + return 0; + } + + /* If there is a "condition", issue the "cmp" or "tst" first. */ + if (cond != ARC_CC_AL) { + if (cond == ARC_CC_SET) + len = tst_r32(buf, rd, rs); + else + len = cmp_r32(buf, rd, rs); + /* + * The issued instruction affects the "disp"lacement as + * it alters the "curr_off" by its "len"gth. The "curr_off" + * should always point to the jump instruction. + */ + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), arcv2_32_jmps[cond], disp); + } else { + /* The straight forward unconditional jump. */ + disp = get_displacement(curr_off, targ_off); + len = arc_b(buf, disp); + } + + return len; +} + +/* + * Generate code for functions calls. There can be two types of calls: + * + * - Calling another BPF function + * - Calling an in-kernel function which is compiled by ARC gcc + * + * In the later case, we must comply to ARCv2 ABI and handle arguments + * and return values accordingly. + */ +u8 gen_func_call(u8 *buf, ARC_ADDR func_addr, bool external_func) +{ + u8 len = 0; + + /* + * In case of an in-kernel function call, always push the 5th + * argument onto the stack, because that's where the ABI dictates + * it should be found. If the callee doesn't really use it, no harm + * is done. The stack is readjusted either way after the call. + */ + if (external_func) + len += push_r64(BUF(buf, len), BPF_REG_5); + + len += jump_and_link(BUF(buf, len), func_addr); + + if (external_func) + len += arc_add_i(BUF(buf, len), ARC_R_SP, ARC_R_SP, ARG5_SIZE); + + return len; +} diff --git a/arch/arc/net/bpf_jit_core.c b/arch/arc/net/bpf_jit_core.c new file mode 100644 index 000000000000..6f6b4ffccf2c --- /dev/null +++ b/arch/arc/net/bpf_jit_core.c @@ -0,0 +1,1425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The back-end-agnostic part of Just-In-Time compiler for eBPF bytecode. + * + * Copyright (c) 2024 Synopsys Inc. + * Author: Shahab Vahedi <shahab@synopsys.com> + */ +#include <linux/bug.h> +#include "bpf_jit.h" + +/* + * Check for the return value. A pattern used often in this file. + * There must be a "ret" variable of type "int" in the scope. + */ +#define CHECK_RET(cmd) \ + do { \ + ret = (cmd); \ + if (ret < 0) \ + return ret; \ + } while (0) + +#ifdef ARC_BPF_JIT_DEBUG +/* Dumps bytes in /var/log/messages at KERN_INFO level (4). */ +static void dump_bytes(const u8 *buf, u32 len, const char *header) +{ + u8 line[64]; + size_t i, j; + + pr_info("-----------------[ %s ]-----------------\n", header); + + for (i = 0, j = 0; i < len; i++) { + /* Last input byte? */ + if (i == len - 1) { + j += scnprintf(line + j, 64 - j, "0x%02x", buf[i]); + pr_info("%s\n", line); + break; + } + /* End of line? 
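+		 * (the dump prints eight bytes per line)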
*/ + else if (i % 8 == 7) { + j += scnprintf(line + j, 64 - j, "0x%02x", buf[i]); + pr_info("%s\n", line); + j = 0; + } else { + j += scnprintf(line + j, 64 - j, "0x%02x, ", buf[i]); + } + } +} +#endif /* ARC_BPF_JIT_DEBUG */ + +/********************* JIT context ***********************/ + +/* + * buf: Translated instructions end up here. + * len: The length of whole block in bytes. + * index: The offset at which the _next_ instruction may be put. + */ +struct jit_buffer { + u8 *buf; + u32 len; + u32 index; +}; + +/* + * This is a subset of "struct jit_context" that its information is deemed + * necessary for the next extra pass to come. + * + * bpf_header: Needed to finally lock the region. + * bpf2insn: Used to find the translation for instructions of interest. + * + * Things like "jit.buf" and "jit.len" can be retrieved respectively from + * "prog->bpf_func" and "prog->jited_len". + */ +struct arc_jit_data { + struct bpf_binary_header *bpf_header; + u32 *bpf2insn; +}; + +/* + * The JIT pertinent context that is used by different functions. + * + * prog: The current eBPF program being handled. + * orig_prog: The original eBPF program before any possible change. + * jit: The JIT buffer and its length. + * bpf_header: The JITed program header. "jit.buf" points inside it. + * emit: If set, opcodes are written to memory; else, a dry-run. + * do_zext: If true, 32-bit sub-regs must be zero extended. + * bpf2insn: Maps BPF insn indices to their counterparts in jit.buf. + * bpf2insn_valid: Indicates if "bpf2ins" is populated with the mappings. + * jit_data: A piece of memory to transfer data to the next pass. + * arc_regs_clobbered: Each bit status determines if that arc reg is clobbered. + * save_blink: Whether ARC's "blink" register needs to be saved. + * frame_size: Derived from "prog->aux->stack_depth". + * epilogue_offset: Used by early "return"s in the code to jump here. + * need_extra_pass: A forecast if an "extra_pass" will occur. + * is_extra_pass: Indicates if the current pass is an extra pass. + * user_bpf_prog: True, if VM opcodes come from a real program. + * blinded: True if "constant blinding" step returned a new "prog". + * success: Indicates if the whole JIT went OK. + */ +struct jit_context { + struct bpf_prog *prog; + struct bpf_prog *orig_prog; + struct jit_buffer jit; + struct bpf_binary_header *bpf_header; + bool emit; + bool do_zext; + u32 *bpf2insn; + bool bpf2insn_valid; + struct arc_jit_data *jit_data; + u32 arc_regs_clobbered; + bool save_blink; + u16 frame_size; + u32 epilogue_offset; + bool need_extra_pass; + bool is_extra_pass; + bool user_bpf_prog; + bool blinded; + bool success; +}; + +/* + * If we're in ARC_BPF_JIT_DEBUG mode and the debug level is right, dump the + * input BPF stream. "bpf_jit_dump()" is not fully suited for this purpose. + */ +static void vm_dump(const struct bpf_prog *prog) +{ +#ifdef ARC_BPF_JIT_DEBUG + if (bpf_jit_enable > 1) + dump_bytes((u8 *)prog->insns, 8 * prog->len, " VM "); +#endif +} + +/* + * If the right level of debug is set, dump the bytes. There are 2 variants + * of this function: + * + * 1. Use the standard bpf_jit_dump() which is meant only for JITed code. + * 2. Use the dump_bytes() to match its "vm_dump()" instance. + */ +static void jit_dump(const struct jit_context *ctx) +{ +#ifdef ARC_BPF_JIT_DEBUG + u8 header[8]; +#endif + const int pass = ctx->is_extra_pass ? 
2 : 1; + + if (bpf_jit_enable <= 1 || !ctx->prog->jited) + return; + +#ifdef ARC_BPF_JIT_DEBUG + scnprintf(header, sizeof(header), "JIT:%d", pass); + dump_bytes(ctx->jit.buf, ctx->jit.len, header); + pr_info("\n"); +#else + bpf_jit_dump(ctx->prog->len, ctx->jit.len, pass, ctx->jit.buf); +#endif +} + +/* Initialise the context so there's no garbage. */ +static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) +{ + memset(ctx, 0, sizeof(ctx)); + + ctx->orig_prog = prog; + + /* If constant blinding was requested but failed, scram. */ + ctx->prog = bpf_jit_blind_constants(prog); + if (IS_ERR(ctx->prog)) + return PTR_ERR(ctx->prog); + ctx->blinded = (ctx->prog == ctx->orig_prog ? false : true); + + /* If the verifier doesn't zero-extend, then we have to do it. */ + ctx->do_zext = !ctx->prog->aux->verifier_zext; + + ctx->is_extra_pass = ctx->prog->jited; + ctx->user_bpf_prog = ctx->prog->is_func; + + return 0; +} + +/* + * Only after the first iteration of normal pass (the dry-run), + * there are valid offsets in ctx->bpf2insn array. + */ +static inline bool offsets_available(const struct jit_context *ctx) +{ + return ctx->bpf2insn_valid; +} + +/* + * "*mem" should be freed when there is no "extra pass" to come, + * or the compilation terminated abruptly. A few of such memory + * allocations are: ctx->jit_data and ctx->bpf2insn. + */ +static inline void maybe_free(struct jit_context *ctx, void **mem) +{ + if (*mem) { + if (!ctx->success || !ctx->need_extra_pass) { + kfree(*mem); + *mem = NULL; + } + } +} + +/* + * Free memories based on the status of the context. + * + * A note about "bpf_header": On successful runs, "bpf_header" is + * not freed, because "jit.buf", a sub-array of it, is returned as + * the "bpf_func". However, "bpf_header" is lost and nothing points + * to it. This should not cause a leakage, because apparently + * "bpf_header" can be revived by "bpf_jit_binary_hdr()". This is + * how "bpf_jit_free()" in "kernel/bpf/core.c" releases the memory. + */ +static void jit_ctx_cleanup(struct jit_context *ctx) +{ + if (ctx->blinded) { + /* if all went well, release the orig_prog. */ + if (ctx->success) + bpf_jit_prog_release_other(ctx->prog, ctx->orig_prog); + else + bpf_jit_prog_release_other(ctx->orig_prog, ctx->prog); + } + + maybe_free(ctx, (void **)&ctx->bpf2insn); + maybe_free(ctx, (void **)&ctx->jit_data); + + if (!ctx->bpf2insn) + ctx->bpf2insn_valid = false; + + /* Freeing "bpf_header" is enough. "jit.buf" is a sub-array of it. */ + if (!ctx->success && ctx->bpf_header) { + bpf_jit_binary_free(ctx->bpf_header); + ctx->bpf_header = NULL; + ctx->jit.buf = NULL; + ctx->jit.index = 0; + ctx->jit.len = 0; + } + + ctx->emit = false; + ctx->do_zext = false; +} + +/* + * Analyse the register usage and record the frame size. + * The register usage is determined by consulting the back-end. + */ +static void analyze_reg_usage(struct jit_context *ctx) +{ + size_t i; + u32 usage = 0; + const struct bpf_insn *insn = ctx->prog->insnsi; + + for (i = 0; i < ctx->prog->len; i++) { + u8 bpf_reg; + bool call; + + bpf_reg = insn[i].dst_reg; + call = (insn[i].code == (BPF_JMP | BPF_CALL)) ? true : false; + usage |= mask_for_used_regs(bpf_reg, call); + } + + ctx->arc_regs_clobbered = usage; + ctx->frame_size = ctx->prog->aux->stack_depth; +} + +/* Verify that no instruction will be emitted when there is no buffer. 
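+ * The check only matters for the emission phase (ctx->emit == true); a
+ * dry-run passes trivially since nothing is written out.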
*/ +static inline int jit_buffer_check(const struct jit_context *ctx) +{ + if (ctx->emit) { + if (!ctx->jit.buf) { + pr_err("bpf-jit: inconsistence state; no " + "buffer to emit instructions.\n"); + return -EINVAL; + } else if (ctx->jit.index > ctx->jit.len) { + pr_err("bpf-jit: estimated JIT length is less " + "than the emitted instructions.\n"); + return -EFAULT; + } + } + return 0; +} + +/* On a dry-run (emit=false), "jit.len" is growing gradually. */ +static inline void jit_buffer_update(struct jit_context *ctx, u32 n) +{ + if (!ctx->emit) + ctx->jit.len += n; + else + ctx->jit.index += n; +} + +/* Based on "emit", determine the address where instructions are emitted. */ +static inline u8 *effective_jit_buf(const struct jit_context *ctx) +{ + return ctx->emit ? (ctx->jit.buf + ctx->jit.index) : NULL; +} + +/* Prologue based on context variables set by "analyze_reg_usage()". */ +static int handle_prologue(struct jit_context *ctx) +{ + int ret; + u8 *buf = effective_jit_buf(ctx); + u32 len = 0; + + CHECK_RET(jit_buffer_check(ctx)); + + len = arc_prologue(buf, ctx->arc_regs_clobbered, ctx->frame_size); + jit_buffer_update(ctx, len); + + return 0; +} + +/* The counter part for "handle_prologue()". */ +static int handle_epilogue(struct jit_context *ctx) +{ + int ret; + u8 *buf = effective_jit_buf(ctx); + u32 len = 0; + + CHECK_RET(jit_buffer_check(ctx)); + + len = arc_epilogue(buf, ctx->arc_regs_clobbered, ctx->frame_size); + jit_buffer_update(ctx, len); + + return 0; +} + +/* Tell which number of the BPF instruction we are dealing with. */ +static inline s32 get_index_for_insn(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + return (insn - ctx->prog->insnsi); +} + +/* + * In most of the cases, the "offset" is read from "insn->off". However, + * if it is an unconditional BPF_JMP32, then it comes from "insn->imm". + * + * (Courtesy of "cpu=v4" support) + */ +static inline s32 get_offset(const struct bpf_insn *insn) +{ + if ((BPF_CLASS(insn->code) == BPF_JMP32) && + (BPF_OP(insn->code) == BPF_JA)) + return insn->imm; + else + return insn->off; +} + +/* + * Determine to which number of the BPF instruction we're jumping to. + * + * The "offset" is interpreted as the "number" of BPF instructions + * from the _next_ BPF instruction. e.g.: + * + * 4 means 4 instructions after the next insn + * 0 means 0 instructions after the next insn -> fallthrough. + * -1 means 1 instruction before the next insn -> jmp to current insn. + * + * Another way to look at this, "offset" is the number of instructions + * that exist between the current instruction and the target instruction. + * + * It is worth noting that a "mov r,i64", which is 16-byte long, is + * treated as two instructions long, therefore "offset" needn't be + * treated specially for those. Everything is uniform. + */ +static inline s32 get_target_index_for_insn(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + return (get_index_for_insn(ctx, insn) + 1) + get_offset(insn); +} + +/* Is there an immediate operand encoded in the "insn"? */ +static inline bool has_imm(const struct bpf_insn *insn) +{ + return BPF_SRC(insn->code) == BPF_K; +} + +/* Is the last BPF instruction? */ +static inline bool is_last_insn(const struct bpf_prog *prog, u32 idx) +{ + return idx == (prog->len - 1); +} + +/* + * Invocation of this function, conditionally signals the need for + * an extra pass. The conditions that must be met are: + * + * 1. The current pass itself shouldn't be an extra pass. + * 2. 
The stream of bytes being JITed must come from a user program. + */ +static inline void set_need_for_extra_pass(struct jit_context *ctx) +{ + if (!ctx->is_extra_pass) + ctx->need_extra_pass = ctx->user_bpf_prog; +} + +/* + * Check if the "size" is valid and then transfer the control to + * the back-end for the swap. + */ +static int handle_swap(u8 *buf, u8 rd, u8 size, u8 endian, + bool force, bool do_zext, u8 *len) +{ + /* Sanity check on the size. */ + switch (size) { + case 16: + case 32: + case 64: + break; + default: + pr_err("bpf-jit: invalid size for swap.\n"); + return -EINVAL; + } + + *len = gen_swap(buf, rd, size, endian, force, do_zext); + + return 0; +} + +/* Checks if the (instruction) index is in valid range. */ +static inline bool check_insn_idx_valid(const struct jit_context *ctx, + const s32 idx) +{ + return (idx >= 0 && idx < ctx->prog->len); +} + +/* + * Decouple the back-end from BPF by converting BPF conditions + * to internal enum. ARC_CC_* start from 0 and are used as index + * to an array. BPF_J* usage must end after this conversion. + */ +static int bpf_cond_to_arc(const u8 op, u8 *arc_cc) +{ + switch (op) { + case BPF_JA: + *arc_cc = ARC_CC_AL; + break; + case BPF_JEQ: + *arc_cc = ARC_CC_EQ; + break; + case BPF_JGT: + *arc_cc = ARC_CC_UGT; + break; + case BPF_JGE: + *arc_cc = ARC_CC_UGE; + break; + case BPF_JSET: + *arc_cc = ARC_CC_SET; + break; + case BPF_JNE: + *arc_cc = ARC_CC_NE; + break; + case BPF_JSGT: + *arc_cc = ARC_CC_SGT; + break; + case BPF_JSGE: + *arc_cc = ARC_CC_SGE; + break; + case BPF_JLT: + *arc_cc = ARC_CC_ULT; + break; + case BPF_JLE: + *arc_cc = ARC_CC_ULE; + break; + case BPF_JSLT: + *arc_cc = ARC_CC_SLT; + break; + case BPF_JSLE: + *arc_cc = ARC_CC_SLE; + break; + default: + pr_err("bpf-jit: can't handle condition 0x%02X\n", op); + return -EINVAL; + } + return 0; +} + +/* + * Check a few things for a supposedly "jump" instruction: + * + * 0. "insn" is a "jump" instruction, but not the "call/exit" variant. + * 1. The current "insn" index is in valid range. + * 2. The index of target instruction is in valid range. + */ +static int check_bpf_jump(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + const u8 class = BPF_CLASS(insn->code); + const u8 op = BPF_OP(insn->code); + + /* Must be a jmp(32) instruction that is not a "call/exit". */ + if ((class != BPF_JMP && class != BPF_JMP32) || + (op == BPF_CALL || op == BPF_EXIT)) { + pr_err("bpf-jit: not a jump instruction.\n"); + return -EINVAL; + } + + if (!check_insn_idx_valid(ctx, get_index_for_insn(ctx, insn))) { + pr_err("bpf-jit: the bpf jump insn is not in prog.\n"); + return -EINVAL; + } + + if (!check_insn_idx_valid(ctx, get_target_index_for_insn(ctx, insn))) { + pr_err("bpf-jit: bpf jump label is out of range.\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Based on input "insn", consult "ctx->bpf2insn" to get the + * related index (offset) of the translation in JIT stream. + */ +static u32 get_curr_jit_off(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + const s32 idx = get_index_for_insn(ctx, insn); +#ifdef ARC_BPF_JIT_DEBUG + BUG_ON(!offsets_available(ctx) || !check_insn_idx_valid(ctx, idx)); +#endif + return ctx->bpf2insn[idx]; +} + +/* + * The input "insn" must be a jump instruction. + * + * Based on input "insn", consult "ctx->bpf2insn" to get the + * related JIT index (offset) of "target instruction" that + * "insn" would jump to. 
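+ *
+ * e.g. for a conditional jump with "off == 4" at BPF index "i", the target
+ * index is "i + 1 + 4", so the returned offset is "ctx->bpf2insn[i + 5]".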
+ */ +static u32 get_targ_jit_off(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + const s32 tidx = get_target_index_for_insn(ctx, insn); +#ifdef ARC_BPF_JIT_DEBUG + BUG_ON(!offsets_available(ctx) || !check_insn_idx_valid(ctx, tidx)); +#endif + return ctx->bpf2insn[tidx]; +} + +/* + * This function will return 0 for a feasible jump. + * + * Consult the back-end to check if it finds it feasible to emit + * the necessary instructions based on "cond" and the displacement + * between the "from_off" and the "to_off". + */ +static int feasible_jit_jump(u32 from_off, u32 to_off, u8 cond, bool j32) +{ + int ret = 0; + + if (j32) { + if (!check_jmp_32(from_off, to_off, cond)) + ret = -EFAULT; + } else { + if (!check_jmp_64(from_off, to_off, cond)) + ret = -EFAULT; + } + + if (ret != 0) + pr_err("bpf-jit: the JIT displacement is not OK.\n"); + + return ret; +} + +/* + * This jump handler performs the following steps: + * + * 1. Compute ARC's internal condition code from BPF's + * 2. Determine the bitness of the operation (32 vs. 64) + * 3. Sanity check on BPF stream + * 4. Sanity check on what is supposed to be JIT's displacement + * 5. And finally, emit the necessary instructions + * + * The last two steps are performed through the back-end. + * The value of steps 1 and 2 are necessary inputs for the back-end. + */ +static int handle_jumps(const struct jit_context *ctx, + const struct bpf_insn *insn, + u8 *len) +{ + u8 cond; + int ret = 0; + u8 *buf = effective_jit_buf(ctx); + const bool j32 = (BPF_CLASS(insn->code) == BPF_JMP32) ? true : false; + const u8 rd = insn->dst_reg; + u8 rs = insn->src_reg; + u32 curr_off = 0, targ_off = 0; + + *len = 0; + + /* Map the BPF condition to internal enum. */ + CHECK_RET(bpf_cond_to_arc(BPF_OP(insn->code), &cond)); + + /* Sanity check on the BPF byte stream. */ + CHECK_RET(check_bpf_jump(ctx, insn)); + + /* + * Move the immediate into a temporary register _now_ for 2 reasons: + * + * 1. "gen_jmp_{32,64}()" deal with operands in registers. + * + * 2. The "len" parameter will grow so that the current jit offset + * (curr_off) will have increased to a point where the necessary + * instructions can be inserted by "gen_jmp_{32,64}()". + */ + if (has_imm(insn) && cond != ARC_CC_AL) { + if (j32) { + *len += mov_r32_i32(BUF(buf, *len), JIT_REG_TMP, + insn->imm); + } else { + *len += mov_r64_i32(BUF(buf, *len), JIT_REG_TMP, + insn->imm); + } + rs = JIT_REG_TMP; + } + + /* If the offsets are known, check if the branch can occur. */ + if (offsets_available(ctx)) { + curr_off = get_curr_jit_off(ctx, insn) + *len; + targ_off = get_targ_jit_off(ctx, insn); + + /* Sanity check on the back-end side. */ + CHECK_RET(feasible_jit_jump(curr_off, targ_off, cond, j32)); + } + + if (j32) { + *len += gen_jmp_32(BUF(buf, *len), rd, rs, cond, + curr_off, targ_off); + } else { + *len += gen_jmp_64(BUF(buf, *len), rd, rs, cond, + curr_off, targ_off); + } + + return ret; +} + +/* Jump to translated epilogue address. */ +static int handle_jmp_epilogue(struct jit_context *ctx, + const struct bpf_insn *insn, u8 *len) +{ + u8 *buf = effective_jit_buf(ctx); + u32 curr_off = 0, epi_off = 0; + + /* Check the offset only if the data is available. */ + if (offsets_available(ctx)) { + curr_off = get_curr_jit_off(ctx, insn); + epi_off = ctx->epilogue_offset; + + if (!check_jmp_64(curr_off, epi_off, ARC_CC_AL)) { + pr_err("bpf-jit: epilogue offset is not valid.\n"); + return -EINVAL; + } + } + + /* Jump to "epilogue offset" (rd and rs don't matter). 
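+	 * With ARC_CC_AL, gen_jmp_64() boils down to a single "b" instruction,
+	 * so no register comparison is emitted beforehand.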
*/ + *len = gen_jmp_64(buf, 0, 0, ARC_CC_AL, curr_off, epi_off); + + return 0; +} + +/* Try to get the resolved address and generate the instructions. */ +static int handle_call(struct jit_context *ctx, + const struct bpf_insn *insn, + u8 *len) +{ + int ret; + bool in_kernel_func, fixed = false; + u64 addr = 0; + u8 *buf = effective_jit_buf(ctx); + + ret = bpf_jit_get_func_addr(ctx->prog, insn, ctx->is_extra_pass, + &addr, &fixed); + if (ret < 0) { + pr_err("bpf-jit: can't get the address for call.\n"); + return ret; + } + in_kernel_func = (fixed ? true : false); + + /* No valuable address retrieved (yet). */ + if (!fixed && !addr) + set_need_for_extra_pass(ctx); + + *len = gen_func_call(buf, (ARC_ADDR)addr, in_kernel_func); + + if (insn->src_reg != BPF_PSEUDO_CALL) { + /* Assigning ABI's return reg to JIT's return reg. */ + *len += arc_to_bpf_return(BUF(buf, *len)); + } + + return 0; +} + +/* + * Try to generate instructions for loading a 64-bit immediate. + * These sort of instructions are usually associated with the 64-bit + * relocations: R_BPF_64_64. Therefore, signal the need for an extra + * pass if the circumstances are right. + */ +static int handle_ld_imm64(struct jit_context *ctx, + const struct bpf_insn *insn, + u8 *len) +{ + const s32 idx = get_index_for_insn(ctx, insn); + u8 *buf = effective_jit_buf(ctx); + + /* We're about to consume 2 VM instructions. */ + if (is_last_insn(ctx->prog, idx)) { + pr_err("bpf-jit: need more data for 64-bit immediate.\n"); + return -EINVAL; + } + + *len = mov_r64_i64(buf, insn->dst_reg, insn->imm, (insn + 1)->imm); + + if (bpf_pseudo_func(insn)) + set_need_for_extra_pass(ctx); + + return 0; +} + +/* + * Handles one eBPF instruction at a time. To make this function faster, + * it does not call "jit_buffer_check()". Else, it would call it for every + * instruction. As a result, it should not be invoked directly. Only + * "handle_body()", that has already executed the "check", may call this + * function. + * + * If the "ret" value is negative, something has went wrong. Else, + * it mostly holds the value 0 and rarely 1. Number 1 signals + * the loop in "handle_body()" to skip the next instruction, because + * it has been consumed as part of a 64-bit immediate value. 
+ */ +static int handle_insn(struct jit_context *ctx, u32 idx) +{ + const struct bpf_insn *insn = &ctx->prog->insnsi[idx]; + const u8 code = insn->code; + const u8 dst = insn->dst_reg; + const u8 src = insn->src_reg; + const s16 off = insn->off; + const s32 imm = insn->imm; + u8 *buf = effective_jit_buf(ctx); + u8 len = 0; + int ret = 0; + + switch (code) { + /* dst += src (32-bit) */ + case BPF_ALU | BPF_ADD | BPF_X: + len = add_r32(buf, dst, src); + break; + /* dst += imm (32-bit) */ + case BPF_ALU | BPF_ADD | BPF_K: + len = add_r32_i32(buf, dst, imm); + break; + /* dst -= src (32-bit) */ + case BPF_ALU | BPF_SUB | BPF_X: + len = sub_r32(buf, dst, src); + break; + /* dst -= imm (32-bit) */ + case BPF_ALU | BPF_SUB | BPF_K: + len = sub_r32_i32(buf, dst, imm); + break; + /* dst = -dst (32-bit) */ + case BPF_ALU | BPF_NEG: + len = neg_r32(buf, dst); + break; + /* dst *= src (32-bit) */ + case BPF_ALU | BPF_MUL | BPF_X: + len = mul_r32(buf, dst, src); + break; + /* dst *= imm (32-bit) */ + case BPF_ALU | BPF_MUL | BPF_K: + len = mul_r32_i32(buf, dst, imm); + break; + /* dst /= src (32-bit) */ + case BPF_ALU | BPF_DIV | BPF_X: + len = div_r32(buf, dst, src, off == 1); + break; + /* dst /= imm (32-bit) */ + case BPF_ALU | BPF_DIV | BPF_K: + len = div_r32_i32(buf, dst, imm, off == 1); + break; + /* dst %= src (32-bit) */ + case BPF_ALU | BPF_MOD | BPF_X: + len = mod_r32(buf, dst, src, off == 1); + break; + /* dst %= imm (32-bit) */ + case BPF_ALU | BPF_MOD | BPF_K: + len = mod_r32_i32(buf, dst, imm, off == 1); + break; + /* dst &= src (32-bit) */ + case BPF_ALU | BPF_AND | BPF_X: + len = and_r32(buf, dst, src); + break; + /* dst &= imm (32-bit) */ + case BPF_ALU | BPF_AND | BPF_K: + len = and_r32_i32(buf, dst, imm); + break; + /* dst |= src (32-bit) */ + case BPF_ALU | BPF_OR | BPF_X: + len = or_r32(buf, dst, src); + break; + /* dst |= imm (32-bit) */ + case BPF_ALU | BPF_OR | BPF_K: + len = or_r32_i32(buf, dst, imm); + break; + /* dst ^= src (32-bit) */ + case BPF_ALU | BPF_XOR | BPF_X: + len = xor_r32(buf, dst, src); + break; + /* dst ^= imm (32-bit) */ + case BPF_ALU | BPF_XOR | BPF_K: + len = xor_r32_i32(buf, dst, imm); + break; + /* dst <<= src (32-bit) */ + case BPF_ALU | BPF_LSH | BPF_X: + len = lsh_r32(buf, dst, src); + break; + /* dst <<= imm (32-bit) */ + case BPF_ALU | BPF_LSH | BPF_K: + len = lsh_r32_i32(buf, dst, imm); + break; + /* dst >>= src (32-bit) [unsigned] */ + case BPF_ALU | BPF_RSH | BPF_X: + len = rsh_r32(buf, dst, src); + break; + /* dst >>= imm (32-bit) [unsigned] */ + case BPF_ALU | BPF_RSH | BPF_K: + len = rsh_r32_i32(buf, dst, imm); + break; + /* dst >>= src (32-bit) [signed] */ + case BPF_ALU | BPF_ARSH | BPF_X: + len = arsh_r32(buf, dst, src); + break; + /* dst >>= imm (32-bit) [signed] */ + case BPF_ALU | BPF_ARSH | BPF_K: + len = arsh_r32_i32(buf, dst, imm); + break; + /* dst = src (32-bit) */ + case BPF_ALU | BPF_MOV | BPF_X: + len = mov_r32(buf, dst, src, (u8)off); + break; + /* dst = imm32 (32-bit) */ + case BPF_ALU | BPF_MOV | BPF_K: + len = mov_r32_i32(buf, dst, imm); + break; + /* dst = swap(dst) */ + case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU | BPF_END | BPF_FROM_BE: + case BPF_ALU64 | BPF_END | BPF_FROM_LE: { + CHECK_RET(handle_swap(buf, dst, imm, BPF_SRC(code), + BPF_CLASS(code) == BPF_ALU64, + ctx->do_zext, &len)); + break; + } + /* dst += src (64-bit) */ + case BPF_ALU64 | BPF_ADD | BPF_X: + len = add_r64(buf, dst, src); + break; + /* dst += imm32 (64-bit) */ + case BPF_ALU64 | BPF_ADD | BPF_K: + len = add_r64_i32(buf, dst, imm); + break; + 
/* dst -= src (64-bit) */ + case BPF_ALU64 | BPF_SUB | BPF_X: + len = sub_r64(buf, dst, src); + break; + /* dst -= imm32 (64-bit) */ + case BPF_ALU64 | BPF_SUB | BPF_K: + len = sub_r64_i32(buf, dst, imm); + break; + /* dst = -dst (64-bit) */ + case BPF_ALU64 | BPF_NEG: + len = neg_r64(buf, dst); + break; + /* dst *= src (64-bit) */ + case BPF_ALU64 | BPF_MUL | BPF_X: + len = mul_r64(buf, dst, src); + break; + /* dst *= imm32 (64-bit) */ + case BPF_ALU64 | BPF_MUL | BPF_K: + len = mul_r64_i32(buf, dst, imm); + break; + /* dst &= src (64-bit) */ + case BPF_ALU64 | BPF_AND | BPF_X: + len = and_r64(buf, dst, src); + break; + /* dst &= imm32 (64-bit) */ + case BPF_ALU64 | BPF_AND | BPF_K: + len = and_r64_i32(buf, dst, imm); + break; + /* dst |= src (64-bit) */ + case BPF_ALU64 | BPF_OR | BPF_X: + len = or_r64(buf, dst, src); + break; + /* dst |= imm32 (64-bit) */ + case BPF_ALU64 | BPF_OR | BPF_K: + len = or_r64_i32(buf, dst, imm); + break; + /* dst ^= src (64-bit) */ + case BPF_ALU64 | BPF_XOR | BPF_X: + len = xor_r64(buf, dst, src); + break; + /* dst ^= imm32 (64-bit) */ + case BPF_ALU64 | BPF_XOR | BPF_K: + len = xor_r64_i32(buf, dst, imm); + break; + /* dst <<= src (64-bit) */ + case BPF_ALU64 | BPF_LSH | BPF_X: + len = lsh_r64(buf, dst, src); + break; + /* dst <<= imm32 (64-bit) */ + case BPF_ALU64 | BPF_LSH | BPF_K: + len = lsh_r64_i32(buf, dst, imm); + break; + /* dst >>= src (64-bit) [unsigned] */ + case BPF_ALU64 | BPF_RSH | BPF_X: + len = rsh_r64(buf, dst, src); + break; + /* dst >>= imm32 (64-bit) [unsigned] */ + case BPF_ALU64 | BPF_RSH | BPF_K: + len = rsh_r64_i32(buf, dst, imm); + break; + /* dst >>= src (64-bit) [signed] */ + case BPF_ALU64 | BPF_ARSH | BPF_X: + len = arsh_r64(buf, dst, src); + break; + /* dst >>= imm32 (64-bit) [signed] */ + case BPF_ALU64 | BPF_ARSH | BPF_K: + len = arsh_r64_i32(buf, dst, imm); + break; + /* dst = src (64-bit) */ + case BPF_ALU64 | BPF_MOV | BPF_X: + len = mov_r64(buf, dst, src, (u8)off); + break; + /* dst = imm32 (sign extend to 64-bit) */ + case BPF_ALU64 | BPF_MOV | BPF_K: + len = mov_r64_i32(buf, dst, imm); + break; + /* dst = imm64 */ + case BPF_LD | BPF_DW | BPF_IMM: + CHECK_RET(handle_ld_imm64(ctx, insn, &len)); + /* Tell the loop to skip the next instruction. 
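+		 * ("ld r64, imm64" occupies two 8-byte BPF slots; the second
+		 * slot only carries the upper 32 bits of the immediate.)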
*/ + ret = 1; + break; + /* dst = *(size *)(src + off) */ + case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_MEM | BPF_DW: + len = load_r(buf, dst, src, off, BPF_SIZE(code), false); + break; + case BPF_LDX | BPF_MEMSX | BPF_W: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_B: + len = load_r(buf, dst, src, off, BPF_SIZE(code), true); + break; + /* *(size *)(dst + off) = src */ + case BPF_STX | BPF_MEM | BPF_W: + case BPF_STX | BPF_MEM | BPF_H: + case BPF_STX | BPF_MEM | BPF_B: + case BPF_STX | BPF_MEM | BPF_DW: + len = store_r(buf, src, dst, off, BPF_SIZE(code)); + break; + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + case BPF_ST | BPF_MEM | BPF_DW: + len = store_i(buf, imm, dst, off, BPF_SIZE(code)); + break; + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JA: + case BPF_JMP32 | BPF_JEQ | BPF_X: + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_X: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_X: + case BPF_JMP32 | BPF_JSET | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_X: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_X: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_X: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_X: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_X: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_X: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_X: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_X: + case BPF_JMP32 | BPF_JSLE | BPF_K: + CHECK_RET(handle_jumps(ctx, insn, &len)); + break; + case BPF_JMP | BPF_CALL: + CHECK_RET(handle_call(ctx, insn, &len)); + break; + + case BPF_JMP | BPF_EXIT: + /* If this is the last instruction, epilogue will follow. */ + if (is_last_insn(ctx->prog, idx)) + break; + CHECK_RET(handle_jmp_epilogue(ctx, insn, &len)); + break; + default: + pr_err("bpf-jit: can't handle instruction code 0x%02X\n", code); + return -EOPNOTSUPP; + } + + if (BPF_CLASS(code) == BPF_ALU) { + /* + * Skip the "swap" instructions. Even 64-bit swaps are of type + * BPF_ALU (and not BPF_ALU64). Therefore, for the swaps, one + * has to look at the "size" of the operations rather than the + * ALU type. "gen_swap()" specifically takes care of that. 
+ */ + if (BPF_OP(code) != BPF_END && ctx->do_zext) + len += zext(BUF(buf, len), dst); + } + + jit_buffer_update(ctx, len); + + return ret; +} + +static int handle_body(struct jit_context *ctx) +{ + int ret; + bool populate_bpf2insn = false; + const struct bpf_prog *prog = ctx->prog; + + CHECK_RET(jit_buffer_check(ctx)); + + /* + * Record the mapping for the instructions during the dry-run. + * Doing it this way allows us to have the mapping ready for + * the jump instructions during the real compilation phase. + */ + if (!ctx->emit) + populate_bpf2insn = true; + + for (u32 i = 0; i < prog->len; i++) { + /* During the dry-run, jit.len grows gradually per BPF insn. */ + if (populate_bpf2insn) + ctx->bpf2insn[i] = ctx->jit.len; + + CHECK_RET(handle_insn(ctx, i)); + if (ret > 0) { + /* "ret" is 1 if two (64-bit) chunks were consumed. */ + ctx->bpf2insn[i + 1] = ctx->bpf2insn[i]; + i++; + } + } + + /* If bpf2insn had to be populated, then it is done at this point. */ + if (populate_bpf2insn) + ctx->bpf2insn_valid = true; + + return 0; +} + +/* + * Initialize the memory with "unimp_s" which is the mnemonic for + * "unimplemented" instruction and always raises an exception. + * + * The instruction is 2 bytes. If "size" is odd, there is not much + * that can be done about the last byte in "area". Because, the + * CPU always fetches instructions in two bytes. Therefore, the + * byte beyond the last one is going to accompany it during a + * possible fetch. In the most likely case of a little endian + * system, that beyond-byte will become the major opcode and + * we have no control over its initialisation. + */ +static void fill_ill_insn(void *area, unsigned int size) +{ + const u16 unimp_s = 0x79e0; + + if (size & 1) { + *((u8 *)area + (size - 1)) = 0xff; + size -= 1; + } + + memset16(area, unimp_s, size >> 1); +} + +/* Piece of memory that can be allocated at the beginning of jit_prepare(). */ +static int jit_prepare_early_mem_alloc(struct jit_context *ctx) +{ + ctx->bpf2insn = kcalloc(ctx->prog->len, sizeof(ctx->jit.len), + GFP_KERNEL); + + if (!ctx->bpf2insn) { + pr_err("bpf-jit: could not allocate memory for " + "mapping of the instructions.\n"); + return -ENOMEM; + } + + return 0; +} + +/* + * Memory allocations that rely on parameters known at the end of + * jit_prepare(). + */ +static int jit_prepare_final_mem_alloc(struct jit_context *ctx) +{ + const size_t alignment = sizeof(u32); + + ctx->bpf_header = bpf_jit_binary_alloc(ctx->jit.len, &ctx->jit.buf, + alignment, fill_ill_insn); + if (!ctx->bpf_header) { + pr_err("bpf-jit: could not allocate memory for translation.\n"); + return -ENOMEM; + } + + if (ctx->need_extra_pass) { + ctx->jit_data = kzalloc(sizeof(*ctx->jit_data), GFP_KERNEL); + if (!ctx->jit_data) + return -ENOMEM; + } + + return 0; +} + +/* + * The first phase of the translation without actually emitting any + * instruction. It helps in getting a forecast on some aspects, such + * as the length of the whole program or where the epilogue starts. + * + * Whenever the necessary parameters are known, memories are allocated. + */ +static int jit_prepare(struct jit_context *ctx) +{ + int ret; + + /* Dry run. */ + ctx->emit = false; + + CHECK_RET(jit_prepare_early_mem_alloc(ctx)); + + /* Get the length of prologue section after some register analysis. */ + analyze_reg_usage(ctx); + CHECK_RET(handle_prologue(ctx)); + + CHECK_RET(handle_body(ctx)); + + /* Record at which offset epilogue begins. */ + ctx->epilogue_offset = ctx->jit.len; + + /* Process the epilogue section now. 
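+	 * Its length still contributes to "jit.len", which the final memory
+	 * allocation below relies on.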
*/ + CHECK_RET(handle_epilogue(ctx)); + + CHECK_RET(jit_prepare_final_mem_alloc(ctx)); + + return 0; +} + +/* + * All the "handle_*()" functions have been called before by the + * "jit_prepare()". If there was an error, we would know by now. + * Therefore, no extra error checking at this point, other than + * a sanity check at the end that expects the calculated length + * (jit.len) to be equal to the length of generated instructions + * (jit.index). + */ +static int jit_compile(struct jit_context *ctx) +{ + int ret; + + /* Let there be code. */ + ctx->emit = true; + + CHECK_RET(handle_prologue(ctx)); + + CHECK_RET(handle_body(ctx)); + + CHECK_RET(handle_epilogue(ctx)); + + if (ctx->jit.index != ctx->jit.len) { + pr_err("bpf-jit: divergence between the phases; " + "%u vs. %u (bytes).\n", + ctx->jit.len, ctx->jit.index); + return -EFAULT; + } + + return 0; +} + +/* + * Calling this function implies a successful JIT. A successful + * translation is signaled by setting the right parameters: + * + * prog->jited=1, prog->jited_len=..., prog->bpf_func=... + */ +static int jit_finalize(struct jit_context *ctx) +{ + struct bpf_prog *prog = ctx->prog; + + /* We're going to need this information for the "do_extra_pass()". */ + if (ctx->need_extra_pass) { + ctx->jit_data->bpf_header = ctx->bpf_header; + ctx->jit_data->bpf2insn = ctx->bpf2insn; + prog->aux->jit_data = (void *)ctx->jit_data; + } else { + /* + * If things seem finalised, then mark the JITed memory + * as R-X and flush it. + */ + if (bpf_jit_binary_lock_ro(ctx->bpf_header)) { + pr_err("bpf-jit: Could not lock the JIT memory.\n"); + return -EFAULT; + } + flush_icache_range((unsigned long)ctx->bpf_header, + (unsigned long) + BUF(ctx->jit.buf, ctx->jit.len)); + prog->aux->jit_data = NULL; + bpf_prog_fill_jited_linfo(prog, ctx->bpf2insn); + } + + ctx->success = true; + prog->bpf_func = (void *)ctx->jit.buf; + prog->jited_len = ctx->jit.len; + prog->jited = 1; + + jit_ctx_cleanup(ctx); + jit_dump(ctx); + + return 0; +} + +/* + * A lenient verification for the existence of JIT context in "prog". + * Apparently the JIT internals, namely jit_subprogs() in bpf/verifier.c, + * may request for a second compilation although nothing needs to be done. + */ +static inline int check_jit_context(const struct bpf_prog *prog) +{ + if (!prog->aux->jit_data) { + pr_notice("bpf-jit: no jit data for the extra pass.\n"); + return 1; + } else { + return 0; + } +} + +/* Reuse the previous pass's data. */ +static int jit_resume_context(struct jit_context *ctx) +{ + struct arc_jit_data *jdata = + (struct arc_jit_data *)ctx->prog->aux->jit_data; + + if (!jdata) { + pr_err("bpf-jit: no jit data for the extra pass.\n"); + return -EINVAL; + } + + ctx->jit.buf = (u8 *)ctx->prog->bpf_func; + ctx->jit.len = ctx->prog->jited_len; + ctx->bpf_header = jdata->bpf_header; + ctx->bpf2insn = (u32 *)jdata->bpf2insn; + ctx->bpf2insn_valid = ctx->bpf2insn ? true : false; + ctx->jit_data = jdata; + + return 0; +} + +/* + * Patch in the new addresses. The instructions of interest are: + * + * - call + * - ld r64, imm64 + * + * For "call"s, it resolves the addresses one more time through the + * handle_call(). + * + * For 64-bit immediate loads, it just retranslates them, because the BPF + * core in kernel might have changed the value since the normal pass. 
+ */ +static int jit_patch_relocations(struct jit_context *ctx) +{ + const u8 bpf_opc_call = BPF_JMP | BPF_CALL; + const u8 bpf_opc_ldi64 = BPF_LD | BPF_DW | BPF_IMM; + const struct bpf_prog *prog = ctx->prog; + int ret; + + ctx->emit = true; + for (u32 i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &prog->insnsi[i]; + u8 dummy; + /* + * Adjust "ctx.jit.index", so "gen_*()" functions below + * can use it for their output addresses. + */ + ctx->jit.index = ctx->bpf2insn[i]; + + if (insn->code == bpf_opc_call) { + CHECK_RET(handle_call(ctx, insn, &dummy)); + } else if (insn->code == bpf_opc_ldi64) { + CHECK_RET(handle_ld_imm64(ctx, insn, &dummy)); + /* Skip the next instruction. */ + ++i; + } + } + return 0; +} + +/* + * A normal pass that involves a "dry-run" phase, jit_prepare(), + * to get the necessary data for the real compilation phase, + * jit_compile(). + */ +static struct bpf_prog *do_normal_pass(struct bpf_prog *prog) +{ + struct jit_context ctx; + + /* Bail out if JIT is disabled. */ + if (!prog->jit_requested) + return prog; + + if (jit_ctx_init(&ctx, prog)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + /* Get the lengths and allocate buffer. */ + if (jit_prepare(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_compile(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_finalize(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + return ctx.prog; +} + +/* + * If there are multi-function BPF programs that call each other, + * their translated addresses are not known all at once. Therefore, + * an extra pass is needed to consult the bpf_jit_get_func_addr() + * again to get the newly translated addresses in order to resolve + * the "call"s. + */ +static struct bpf_prog *do_extra_pass(struct bpf_prog *prog) +{ + struct jit_context ctx; + + /* Skip if there's no context to resume from. */ + if (check_jit_context(prog)) + return prog; + + if (jit_ctx_init(&ctx, prog)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_resume_context(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_patch_relocations(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_finalize(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + return ctx.prog; +} + +/* + * This function may be invoked twice for the same stream of BPF + * instructions. The "extra pass" happens, when there are "call"s + * involved that their addresses are not known during the first + * invocation. + */ +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +{ + vm_dump(prog); + + /* Was this program already translated? 
*/ + if (!prog->jited) + return do_normal_pass(prog); + else + return do_extra_pass(prog); + + return prog; +} diff --git a/arch/arm/boot/dts/st/stm32mp131.dtsi b/arch/arm/boot/dts/st/stm32mp131.dtsi index ecfa120827ba..6704ceef284d 100644 --- a/arch/arm/boot/dts/st/stm32mp131.dtsi +++ b/arch/arm/boot/dts/st/stm32mp131.dtsi @@ -783,10 +783,82 @@ }; exti: interrupt-controller@5000d000 { - compatible = "st,stm32mp13-exti", "syscon"; + compatible = "st,stm32mp1-exti", "syscon"; interrupt-controller; #interrupt-cells = <2>; reg = <0x5000d000 0x400>; + interrupts-extended = + <&intc GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */ + <&intc GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 66 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 68 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */ + <&intc GIC_SPI 43 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 78 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0>, /* EXTI_20 */ + <&intc GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */ + <&intc GIC_SPI 54 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, /* EXTI_40 */ + <0>, + <0>, + <0>, + <&intc GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */ + <0>, + <&intc GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, /* EXTI_60 */ + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <&intc GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>; /* EXTI_70 */ }; syscfg: syscon@50020000 { diff --git a/arch/arm/boot/dts/st/stm32mp151.dtsi b/arch/arm/boot/dts/st/stm32mp151.dtsi index 16bd6eee32b4..90c5c72c87ab 100644 --- a/arch/arm/boot/dts/st/stm32mp151.dtsi +++ b/arch/arm/boot/dts/st/stm32mp151.dtsi @@ -176,6 +176,81 @@ interrupt-controller; #interrupt-cells = <2>; reg = <0x5000d000 0x400>; + interrupts-extended = + <&intc GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */ + <&intc GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 64 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 66 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */ + <&intc GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 121 
IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0>, /* EXTI_20 */ + <&intc GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 95 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */ + <&intc GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, /* EXTI_40 */ + <0>, + <0>, + <0>, + <0>, + <0>, + <&intc GIC_SPI 151 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */ + <0>, + <&intc GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 135 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, /* EXTI_60 */ + <&intc GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <0>, + <&intc GIC_SPI 144 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_70 */ + <0>, + <0>, + <&intc GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>; }; syscfg: syscon@50020000 { diff --git a/arch/arm/configs/collie_defconfig b/arch/arm/configs/collie_defconfig index 01b5a5a73f03..42cb1c854118 100644 --- a/arch/arm/configs/collie_defconfig +++ b/arch/arm/configs/collie_defconfig @@ -3,7 +3,7 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_EXPERT=y -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_EPOLL is not set CONFIG_ARCH_MULTI_V4=y # CONFIG_ARCH_MULTI_V7 is not set diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig index 59c4835ffc97..c1291ca290b2 100644 --- a/arch/arm/configs/keystone_defconfig +++ b/arch/arm/configs/keystone_defconfig @@ -12,7 +12,7 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_BLK_DEV_INITRD=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EXPERT=y CONFIG_PROFILING=y diff --git a/arch/arm/configs/lpc18xx_defconfig b/arch/arm/configs/lpc18xx_defconfig index d169da9b2824..f55c231e0870 100644 --- a/arch/arm/configs/lpc18xx_defconfig +++ b/arch/arm/configs/lpc18xx_defconfig @@ -8,7 +8,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_RD_LZ4 is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_UID16 is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig index 1d41e73f4903..34d079e03b3c 100644 --- a/arch/arm/configs/moxart_defconfig +++ b/arch/arm/configs/moxart_defconfig @@ -6,7 +6,7 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SIGNALFD is not set # CONFIG_TIMERFD is not set # CONFIG_EVENTFD is not set diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig index 3ed73f184d83..e995e50537ef 100644 --- a/arch/arm/configs/mps2_defconfig +++ b/arch/arm/configs/mps2_defconfig @@ -5,7 +5,7 @@ 
CONFIG_LOG_BUF_SHIFT=16 CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EXPERT=y # CONFIG_UID16 is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig index 729ea8157e2a..025b595dd837 100644 --- a/arch/arm/configs/omap1_defconfig +++ b/arch/arm/configs/omap1_defconfig @@ -9,7 +9,7 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SHMEM is not set # CONFIG_KALLSYMS is not set CONFIG_PROFILING=y diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig index b9fe3fbed5ae..3baec075d1ef 100644 --- a/arch/arm/configs/stm32_defconfig +++ b/arch/arm/configs/stm32_defconfig @@ -6,7 +6,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EXPERT=y # CONFIG_UID16 is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/arm/mach-pxa/devices.c b/arch/arm/mach-pxa/devices.c index 661b3fc43275..1e4cd502340e 100644 --- a/arch/arm/mach-pxa/devices.c +++ b/arch/arm/mach-pxa/devices.c @@ -7,7 +7,6 @@ #include <linux/clk-provider.h> #include <linux/dma-mapping.h> #include <linux/dmaengine.h> -#include <linux/spi/pxa2xx_spi.h> #include <linux/platform_data/i2c-pxa.h> #include <linux/soc/pxa/cpu.h> @@ -665,23 +664,6 @@ struct platform_device pxa27x_device_gpio = { .resource = pxa_resource_gpio, }; -/* pxa2xx-spi platform-device ID equals respective SSP platform-device ID + 1. - * See comment in arch/arm/mach-pxa/ssp.c::ssp_probe() */ -void __init pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_controller *info) -{ - struct platform_device *pd; - - pd = platform_device_alloc("pxa2xx-spi", id); - if (pd == NULL) { - printk(KERN_ERR "pxa2xx-spi: failed to allocate device id %d\n", - id); - return; - } - - pd->dev.platform_data = info; - platform_device_add(pd); -} - static struct resource pxa_dma_resource[] = { [0] = { .start = 0x40000000, diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index cc691b199429..3c5f5a3cb480 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -18,10 +18,10 @@ #include <linux/i2c.h> #include <linux/platform_data/i2c-pxa.h> #include <linux/platform_data/pca953x.h> +#include <linux/property.h> #include <linux/spi/spi.h> #include <linux/spi/ads7846.h> #include <linux/spi/corgi_lcd.h> -#include <linux/spi/pxa2xx_spi.h> #include <linux/mtd/sharpsl.h> #include <linux/mtd/physmap.h> #include <linux/input-event-codes.h> @@ -569,10 +569,6 @@ static struct spi_board_info spitz_spi_devices[] = { }, }; -static struct pxa2xx_spi_controller spitz_spi_info = { - .num_chipselect = 3, -}; - static struct gpiod_lookup_table spitz_spi_gpio_table = { .dev_id = "spi2", .table = { @@ -583,8 +579,21 @@ static struct gpiod_lookup_table spitz_spi_gpio_table = { }, }; +static const struct property_entry spitz_spi_properties[] = { + PROPERTY_ENTRY_U32("num-cs", 3), + { } +}; + +static const struct software_node spitz_spi_node = { + .properties = spitz_spi_properties, +}; + static void __init spitz_spi_init(void) { + struct platform_device *pd; + int id = 2; + int err; + if (machine_is_akita()) gpiod_add_lookup_table(&akita_lcdcon_gpio_table); else @@ -592,7 +601,21 @@ static void __init spitz_spi_init(void) gpiod_add_lookup_table(&spitz_ads7846_gpio_table); 
gpiod_add_lookup_table(&spitz_spi_gpio_table); - pxa2xx_set_spi_info(2, &spitz_spi_info); + + /* pxa2xx-spi platform-device ID equals respective SSP platform-device ID + 1 */ + pd = platform_device_alloc("pxa2xx-spi", id); + if (pd == NULL) { + pr_err("pxa2xx-spi: failed to allocate device id %d\n", id); + } else { + err = device_add_software_node(&pd->dev, &spitz_spi_node); + if (err) { + platform_device_put(pd); + pr_err("pxa2xx-spi: failed to add software node\n"); + } else { + platform_device_add(pd); + } + } + spi_register_board_info(ARRAY_AND_SIZE(spitz_spi_devices)); } #else diff --git a/arch/arm/mach-pxa/spitz_pm.c b/arch/arm/mach-pxa/spitz_pm.c index 8bc4ea51a0c1..03b4b347f11a 100644 --- a/arch/arm/mach-pxa/spitz_pm.c +++ b/arch/arm/mach-pxa/spitz_pm.c @@ -35,18 +35,20 @@ static int spitz_last_ac_status; -static struct gpio spitz_charger_gpios[] = { - { SPITZ_GPIO_KEY_INT, GPIOF_IN, "Keyboard Interrupt" }, - { SPITZ_GPIO_SYNC, GPIOF_IN, "Sync" }, - { SPITZ_GPIO_AC_IN, GPIOF_IN, "Charger Detection" }, - { SPITZ_GPIO_ADC_TEMP_ON, GPIOF_OUT_INIT_LOW, "ADC Temp On" }, - { SPITZ_GPIO_JK_B, GPIOF_OUT_INIT_LOW, "JK B" }, - { SPITZ_GPIO_CHRG_ON, GPIOF_OUT_INIT_LOW, "Charger On" }, -}; - static void spitz_charger_init(void) { - gpio_request_array(ARRAY_AND_SIZE(spitz_charger_gpios)); + gpio_request(SPITZ_GPIO_KEY_INT, "Keyboard Interrupt"); + gpio_direction_input(SPITZ_GPIO_KEY_INT); + gpio_request(SPITZ_GPIO_SYNC, "Sync"); + gpio_direction_input(SPITZ_GPIO_SYNC); + gpio_request(SPITZ_GPIO_AC_IN, "Charger Detection"); + gpio_direction_input(SPITZ_GPIO_AC_IN); + gpio_request(SPITZ_GPIO_ADC_TEMP_ON, "ADC Temp On"); + gpio_direction_output(SPITZ_GPIO_ADC_TEMP_ON, 0); + gpio_request(SPITZ_GPIO_JK_B, "JK B"); + gpio_direction_output(SPITZ_GPIO_JK_B, 0); + gpio_request(SPITZ_GPIO_CHRG_ON, "Charger On"); + gpio_direction_output(SPITZ_GPIO_CHRG_ON, 0); } static void spitz_measure_temp(int on) diff --git a/arch/arm/mach-sa1100/h3600.c b/arch/arm/mach-sa1100/h3600.c index 5e25dfa752e9..1cfc0b1fa41c 100644 --- a/arch/arm/mach-sa1100/h3600.c +++ b/arch/arm/mach-sa1100/h3600.c @@ -20,16 +20,6 @@ #include "generic.h" -/* - * helper for sa1100fb - */ -static struct gpio h3600_lcd_gpio[] = { - { H3XXX_EGPIO_LCD_ON, GPIOF_OUT_INIT_LOW, "LCD power" }, - { H3600_EGPIO_LCD_PCI, GPIOF_OUT_INIT_LOW, "LCD control" }, - { H3600_EGPIO_LCD_5V_ON, GPIOF_OUT_INIT_LOW, "LCD 5v" }, - { H3600_EGPIO_LVDD_ON, GPIOF_OUT_INIT_LOW, "LCD 9v/-6.5v" }, -}; - static bool h3600_lcd_request(void) { static bool h3600_lcd_ok; @@ -38,7 +28,42 @@ static bool h3600_lcd_request(void) if (h3600_lcd_ok) return true; - rc = gpio_request_array(h3600_lcd_gpio, ARRAY_SIZE(h3600_lcd_gpio)); + rc = gpio_request(H3XXX_EGPIO_LCD_ON, "LCD power"); + if (rc) + goto out; + rc = gpio_direction_output(H3XXX_EGPIO_LCD_ON, 0); + if (rc) + goto out_free_on; + rc = gpio_request(H3600_EGPIO_LCD_PCI, "LCD control"); + if (rc) + goto out_free_on; + rc = gpio_direction_output(H3600_EGPIO_LCD_PCI, 0); + if (rc) + goto out_free_pci; + rc = gpio_request(H3600_EGPIO_LCD_5V_ON, "LCD 5v"); + if (rc) + goto out_free_pci; + rc = gpio_direction_output(H3600_EGPIO_LCD_5V_ON, 0); + if (rc) + goto out_free_5v_on; + rc = gpio_request(H3600_EGPIO_LVDD_ON, "LCD 9v/-6.5v"); + if (rc) + goto out_free_5v_on; + rc = gpio_direction_output(H3600_EGPIO_LVDD_ON, 0); + if (rc) + goto out_free_lvdd_on; + + goto out; + +out_free_lvdd_on: + gpio_free(H3600_EGPIO_LVDD_ON); +out_free_5v_on: + gpio_free(H3600_EGPIO_LCD_5V_ON); +out_free_pci: + gpio_free(H3600_EGPIO_LCD_PCI); +out_free_on: 
+ gpio_free(H3XXX_EGPIO_LCD_ON); +out: if (rc) pr_err("%s: can't request GPIOs\n", __func__); else diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 72b5cd697f5d..deeb8f292454 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -2252,28 +2252,21 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* If building the body of the JITed code fails somehow, * we fall back to the interpretation. */ - if (build_body(&ctx) < 0) { - image_ptr = NULL; - bpf_jit_binary_free(header); - prog = orig_prog; - goto out_imms; - } + if (build_body(&ctx) < 0) + goto out_free; build_epilogue(&ctx); /* 3.) Extra pass to validate JITed Code */ - if (validate_code(&ctx)) { - image_ptr = NULL; - bpf_jit_binary_free(header); - prog = orig_prog; - goto out_imms; - } + if (validate_code(&ctx)) + goto out_free; flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx)); if (bpf_jit_enable > 1) /* there are 2 passes here */ bpf_jit_dump(prog->len, image_size, 2, ctx.target); - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) + goto out_free; prog->bpf_func = (void *)ctx.target; prog->jited = 1; prog->jited_len = image_size; @@ -2290,5 +2283,11 @@ out: bpf_jit_prog_release_other(prog, prog == orig_prog ? tmp : orig_prog); return prog; + +out_free: + image_ptr = NULL; + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_imms; } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 74b34a78b7ac..2df0818c3ca9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -256,9 +256,11 @@ config ARM64 select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD + select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD select TRACE_IRQFLAGS_SUPPORT select TRACE_IRQFLAGS_NMI_SUPPORT select HAVE_SOFTIRQ_ON_OWN_STACK + select USER_STACKTRACE_SUPPORT help ARM 64-bit (AArch64) Linux support. 
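Both the h3600.c LCD-GPIO rework and the bpf_jit_32.c change above converge on the same idiom: one linear success path, with labelled error exits that undo each acquisition in reverse order so that every failure site becomes a one-line goto. A generic illustration only (plain user-space C, names invented here, not code from this series):

#include <stdlib.h>

/* Acquire three resources; on any failure, release what was taken so far. */
static int setup_three_buffers(char **pa, char **pb, char **pc)
{
	char *a, *b, *c;

	a = malloc(16);
	if (!a)
		goto err;

	b = malloc(16);
	if (!b)
		goto err_free_a;

	c = malloc(16);
	if (!c)
		goto err_free_b;

	*pa = a;
	*pb = b;
	*pc = c;
	return 0;

err_free_b:
	free(b);
err_free_a:
	free(a);
err:
	return -1;
}

int main(void)
{
	char *a, *b, *c;

	if (setup_three_buffers(&a, &b, &c))
		return EXIT_FAILURE;

	free(c);
	free(b);
	free(a);
	return EXIT_SUCCESS;
}

Because the labels sit in the exact reverse order of acquisition, a failure late in the sequence simply falls through the earlier cleanup steps; that is what lets the bpf_jit_32.c hunk collapse its duplicated cleanup blocks into a single out_free target.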
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index a028ea312378..a52618073de2 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -309,6 +309,7 @@ config ARCH_STM32 select GPIOLIB select PINCTRL select PINCTRL_STM32MP257 + select STM32_EXTI select ARM_SMC_MBOX select ARM_SCMI_PROTOCOL select COMMON_CLK_SCMI diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 0e075d3c546b..b8b1d4f4a572 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -154,6 +154,10 @@ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a # Default target when executing plain make boot := arch/arm64/boot +BOOT_TARGETS := Image vmlinuz.efi image.fit + +PHONY += $(BOOT_TARGETS) + ifeq ($(CONFIG_EFI_ZBOOT),) KBUILD_IMAGE := $(boot)/Image.gz else @@ -162,8 +166,10 @@ endif all: $(notdir $(KBUILD_IMAGE)) -vmlinuz.efi: Image -Image vmlinuz.efi: vmlinux +image.fit: dtbs + +vmlinuz.efi image.fit: Image +$(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ Image.%: Image @@ -215,6 +221,7 @@ virtconfig: define archhelp echo '* Image.gz - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)' echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)' + echo ' image.fit - Flat Image Tree (arch/$(ARCH)/boot/image.fit)' echo ' install - Install uncompressed kernel' echo ' zinstall - Install compressed kernel' echo ' Install using (your) ~/bin/installkernel or' diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore index af5dc61f8b43..abaae9de1bdd 100644 --- a/arch/arm64/boot/.gitignore +++ b/arch/arm64/boot/.gitignore @@ -2,3 +2,4 @@ Image Image.gz vmlinuz* +image.fit diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile index a5a787371117..607a67a649c4 100644 --- a/arch/arm64/boot/Makefile +++ b/arch/arm64/boot/Makefile @@ -16,7 +16,8 @@ OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S -targets := Image Image.bz2 Image.gz Image.lz4 Image.lzma Image.lzo Image.zst +targets := Image Image.bz2 Image.gz Image.lz4 Image.lzma Image.lzo \ + Image.zst image.fit $(obj)/Image: vmlinux FORCE $(call if_changed,objcopy) @@ -39,6 +40,9 @@ $(obj)/Image.lzo: $(obj)/Image FORCE $(obj)/Image.zst: $(obj)/Image FORCE $(call if_changed,zstd) +$(obj)/image.fit: $(obj)/Image $(obj)/dts/dtbs-list FORCE + $(call if_changed,fit) + EFI_ZBOOT_PAYLOAD := Image EFI_ZBOOT_BFD_TARGET := elf64-littleaarch64 EFI_ZBOOT_MACH_TYPE := ARM64 diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi index af421ba24ce0..d12b01c5f41b 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include <dt-bindings/gpio/gpio.h> #include <dt-bindings/interrupt-controller/arm-gic.h> @@ -62,6 +63,10 @@ }; }; +&cpu0 { + cpu-supply = <®_dcdc2>; +}; + &mmc0 { vmmc-supply = <®_dldo1>; /* Card detection pin is not connected */ diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-cpu-opp.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616-cpu-opp.dtsi new file mode 100644 index 000000000000..aca22a7f0191 --- /dev/null +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-cpu-opp.dtsi @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +// Copyright (C) 2023 Martin Botka <martin@somainline.org> + +/ { + cpu_opp_table: opp-table-cpu { + compatible = 
"allwinner,sun50i-h616-operating-points"; + nvmem-cells = <&cpu_speed_grade>; + opp-shared; + + opp-480000000 { + opp-hz = /bits/ 64 <480000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1f>; + }; + + opp-600000000 { + opp-hz = /bits/ 64 <600000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x12>; + }; + + opp-720000000 { + opp-hz = /bits/ 64 <720000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0d>; + }; + + opp-792000000 { + opp-hz = /bits/ 64 <792000000>; + opp-microvolt-speed1 = <900000>; + opp-microvolt-speed4 = <940000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x12>; + }; + + opp-936000000 { + opp-hz = /bits/ 64 <936000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0d>; + }; + + opp-1008000000 { + opp-hz = /bits/ 64 <1008000000>; + opp-microvolt-speed0 = <950000>; + opp-microvolt-speed1 = <940000>; + opp-microvolt-speed2 = <950000>; + opp-microvolt-speed3 = <950000>; + opp-microvolt-speed4 = <1020000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1f>; + }; + + opp-1104000000 { + opp-hz = /bits/ 64 <1104000000>; + opp-microvolt-speed0 = <1000000>; + opp-microvolt-speed2 = <1000000>; + opp-microvolt-speed3 = <1000000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0d>; + }; + + opp-1200000000 { + opp-hz = /bits/ 64 <1200000000>; + opp-microvolt-speed0 = <1050000>; + opp-microvolt-speed1 = <1020000>; + opp-microvolt-speed2 = <1050000>; + opp-microvolt-speed3 = <1050000>; + opp-microvolt-speed4 = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1f>; + }; + + opp-1320000000 { + opp-hz = /bits/ 64 <1320000000>; + opp-microvolt = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1d>; + }; + + opp-1416000000 { + opp-hz = /bits/ 64 <1416000000>; + opp-microvolt = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0d>; + }; + + opp-1512000000 { + opp-hz = /bits/ 64 <1512000000>; + opp-microvolt-speed1 = <1100000>; + opp-microvolt-speed3 = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0a>; + }; + }; +}; + +&cpu0 { + operating-points-v2 = <&cpu_opp_table>; +}; + +&cpu1 { + operating-points-v2 = <&cpu_opp_table>; +}; + +&cpu2 { + operating-points-v2 = <&cpu_opp_table>; +}; + +&cpu3 { + operating-points-v2 = <&cpu_opp_table>; +}; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts index b5d713926a34..a360d8567f95 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts @@ -6,12 +6,17 @@ /dts-v1/; #include "sun50i-h616-orangepi-zero.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" / { model = "OrangePi Zero2"; compatible = "xunlong,orangepi-zero2", "allwinner,sun50i-h616"; }; +&cpu0 { + cpu-supply = <®_dcdca>; +}; + &emac0 { allwinner,rx-delay-ps = <3100>; allwinner,tx-delay-ps = <700>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts b/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts index 959b6fd18483..26d25b5b59e0 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts +++ 
b/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include <dt-bindings/gpio/gpio.h> #include <dt-bindings/interrupt-controller/arm-gic.h> @@ -32,6 +33,10 @@ }; }; +&cpu0 { + cpu-supply = <®_dcdca>; +}; + &ehci0 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi index f8ecd7db4868..921d5f61d8d6 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi @@ -26,6 +26,7 @@ reg = <0>; enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; cpu1: cpu@1 { @@ -34,6 +35,7 @@ reg = <1>; enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; cpu2: cpu@2 { @@ -42,6 +44,7 @@ reg = <2>; enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; cpu3: cpu@3 { @@ -50,6 +53,7 @@ reg = <3>; enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; }; @@ -156,6 +160,10 @@ ths_calibration: thermal-sensor-calibration@14 { reg = <0x14 0x8>; }; + + cpu_speed_grade: cpu-speed-grade@0 { + reg = <0x0 2>; + }; }; watchdog: watchdog@30090a0 { diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi index 8c1263a3939e..e92d150aaf1c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi @@ -4,6 +4,11 @@ */ #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" + +&cpu0 { + cpu-supply = <®_dcdc2>; +}; &mmc2 { pinctrl-names = "default"; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts index 21ca1977055d..6a4f0da97233 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include <dt-bindings/gpio/gpio.h> #include <dt-bindings/interrupt-controller/arm-gic.h> @@ -53,6 +54,10 @@ }; }; +&cpu0 { + cpu-supply = <®_dcdc2>; +}; + &ehci1 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts index b3b1b8692125..e1cd7572a14c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts @@ -6,12 +6,17 @@ /dts-v1/; #include "sun50i-h616-orangepi-zero.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" / { model = "OrangePi Zero3"; compatible = "xunlong,orangepi-zero3", "allwinner,sun50i-h618"; }; +&cpu0 { + cpu-supply = <®_dcdc2>; +}; + &emac0 { allwinner,tx-delay-ps = <700>; phy-mode = "rgmii-rxid"; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts index a1d0cac4d244..d6631bfe629f 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include <dt-bindings/gpio/gpio.h> #include <dt-bindings/interrupt-controller/arm-gic.h> @@ -51,6 +52,10 @@ }; }; +&cpu0 { + cpu-supply = <®_dcdc2>; +}; + &ehci0 { status = "okay"; }; diff --git 
a/arch/arm64/boot/dts/st/stm32mp251.dtsi b/arch/arm64/boot/dts/st/stm32mp251.dtsi index 4b48e4ed2d28..dcd0656d67a8 100644 --- a/arch/arm64/boot/dts/st/stm32mp251.dtsi +++ b/arch/arm64/boot/dts/st/stm32mp251.dtsi @@ -443,6 +443,99 @@ <&clk_dsi_txbyte>; }; + exti1: interrupt-controller@44220000 { + compatible = "st,stm32mp1-exti", "syscon"; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x44220000 0x400>; + interrupts-extended = + <&intc GIC_SPI 268 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */ + <&intc GIC_SPI 269 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 270 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 271 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 272 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 273 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 274 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 275 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 276 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 277 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 278 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */ + <&intc GIC_SPI 279 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 280 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 281 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 282 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 283 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 260 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 259 IRQ_TYPE_LEVEL_HIGH>, + <0>, /* EXTI_20 */ + <&intc GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 181 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */ + <&intc GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 149 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 150 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 152 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 153 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_40 */ + <&intc GIC_SPI 154 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 169 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 182 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 209 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 229 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 215 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 208 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 210 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */ + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <&intc GIC_SPI 171 IRQ_TYPE_LEVEL_HIGH>, + <0>, /* EXTI_60 */ + <&intc GIC_SPI 173 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 220 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_70 */ + <0>, + <&intc GIC_SPI 224 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 202 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 253 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 254 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 255 IRQ_TYPE_LEVEL_HIGH>, + <0>, /* EXTI_80 */ + <0>, + <0>, + <&intc GIC_SPI 257 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 258 IRQ_TYPE_LEVEL_HIGH>; + }; + syscfg: syscon@44230000 { compatible = 
"st,stm32mp25-syscfg", "syscon"; reg = <0x44230000 0x10000>; @@ -453,6 +546,8 @@ #size-cells = <1>; compatible = "st,stm32mp257-pinctrl"; ranges = <0 0x44240000 0xa0400>; + interrupt-parent = <&exti1>; + st,syscfg = <&exti1 0x60 0xff>; pins-are-numbered; gpioa: gpio@44240000 { @@ -582,6 +677,8 @@ #size-cells = <1>; compatible = "st,stm32mp257-z-pinctrl"; ranges = <0 0x46200000 0x400>; + interrupt-parent = <&exti1>; + st,syscfg = <&exti1 0x60 0xff>; pins-are-numbered; gpioz: gpio@46200000 { @@ -597,5 +694,84 @@ }; }; + + exti2: interrupt-controller@46230000 { + compatible = "st,stm32mp1-exti", "syscon"; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x46230000 0x400>; + interrupts-extended = + <&intc GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */ + <&intc GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */ + <&intc GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <0>, /* EXTI_20 */ + <&intc GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 212 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 151 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 216 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 217 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */ + <&intc GIC_SPI 218 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 207 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 175 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 177 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 199 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_40 */ + <0>, + <0>, + <&intc GIC_SPI 200 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */ + <&intc GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, + <0>, /* EXTI_60 */ + <&intc GIC_SPI 221 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 246 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <&intc GIC_SPI 247 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 248 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 249 IRQ_TYPE_LEVEL_HIGH>, + <&intc GIC_SPI 256 IRQ_TYPE_LEVEL_HIGH>, + <0>, + <0>, + <&intc GIC_SPI 213 IRQ_TYPE_LEVEL_HIGH>; /* EXTI_70 */ + }; }; }; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ab8b396428da..bc0b0d75acef 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -50,16 +50,12 @@ msr daif, \flags .endm - .macro enable_dbg - msr daifclr, #8 - .endm - .macro disable_step_tsk, flgs, tmp tbz \flgs, #TIF_SINGLESTEP, 9990f mrs \tmp, mdscr_el1 bic \tmp, \tmp, #DBG_MDSCR_SS msr mdscr_el1, \tmp - isb // Synchronise with enable_dbg + isb // Take effect before a subsequent clear of DAIF.D 9990: .endm @@ -480,9 +476,10 @@ alternative_endif */ .macro reset_pmuserenr_el0, tmpreg mrs \tmpreg, 
id_aa64dfr0_el1 - sbfx \tmpreg, \tmpreg, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 - cmp \tmpreg, #1 // Skip if no PMU present - b.lt 9000f + ubfx \tmpreg, \tmpreg, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 + cmp \tmpreg, #ID_AA64DFR0_EL1_PMUVer_NI + ccmp \tmpreg, #ID_AA64DFR0_EL1_PMUVer_IMP_DEF, #4, ne + b.eq 9000f // Skip if no PMU present or IMP_DEF msr pmuserenr_el0, xzr // Disable PMU access from EL0 9000: .endm diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 52f076afeb96..936389e9aecb 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -86,6 +86,7 @@ #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define ARM_CPU_PART_CORTEX_A78C 0xD4B +#define ARM_CPU_PART_NEOVERSE_V2 0xD4F #define APM_CPU_PART_XGENE 0x000 #define APM_CPU_VAR_POTENZA 0x00 @@ -159,6 +160,7 @@ #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) +#define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index b7afaa026842..e4546b29dd0c 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -59,13 +59,14 @@ .macro __init_el2_debug mrs x1, id_aa64dfr0_el1 - sbfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 - cmp x0, #1 - b.lt .Lskip_pmu_\@ // Skip if no PMU present + ubfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 + cmp x0, #ID_AA64DFR0_EL1_PMUVer_NI + ccmp x0, #ID_AA64DFR0_EL1_PMUVer_IMP_DEF, #4, ne + b.eq .Lskip_pmu_\@ // Skip if no PMU present or IMP_DEF mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to .Lskip_pmu_\@: - csel x2, xzr, x0, lt // all PMU counters from EL1 + csel x2, xzr, x0, eq // all PMU counters from EL1 /* Statistical profiling */ ubfx x0, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index db1aeacd4cd9..8c0a36f72d6f 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -135,6 +135,12 @@ enum aarch64_insn_special_register { AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 }; +enum aarch64_insn_system_register { + AARCH64_INSN_SYSREG_TPIDR_EL1 = 0x4684, + AARCH64_INSN_SYSREG_TPIDR_EL2 = 0x6682, + AARCH64_INSN_SYSREG_SP_EL0 = 0x4208, +}; + enum aarch64_insn_variant { AARCH64_INSN_VARIANT_32BIT, AARCH64_INSN_VARIANT_64BIT @@ -686,6 +692,8 @@ u32 aarch64_insn_gen_cas(enum aarch64_insn_register result, } #endif u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type); +u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, + enum aarch64_insn_system_register sysreg); s32 aarch64_get_branch_offset(u32 insn); u32 aarch64_set_branch_offset(u32 insn, s32 offset); diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 0a7186a93882..d4d7451c2c12 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -5,7 +5,6 @@ #ifndef __ASM_IRQFLAGS_H #define __ASM_IRQFLAGS_H -#include <asm/alternative.h> #include <asm/barrier.h> #include 
<asm/ptrace.h> #include <asm/sysreg.h> diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 6aafbb789991..4e753908b801 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -15,17 +15,23 @@ #define JUMP_LABEL_NOP_SIZE AARCH64_INSN_SIZE +#define JUMP_TABLE_ENTRY(key, label) \ + ".pushsection __jump_table, \"aw\"\n\t" \ + ".align 3\n\t" \ + ".long 1b - ., %l["#label"] - .\n\t" \ + ".quad %c0 - .\n\t" \ + ".popsection\n\t" \ + : : "i"(key) : : label + static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { + char *k = &((char *)key)[branch]; + asm goto( "1: nop \n\t" - " .pushsection __jump_table, \"aw\" \n\t" - " .align 3 \n\t" - " .long 1b - ., %l[l_yes] - . \n\t" - " .quad %c0 - . \n\t" - " .popsection \n\t" - : : "i"(&((char *)key)[branch]) : : l_yes); + JUMP_TABLE_ENTRY(k, l_yes) + ); return false; l_yes: @@ -35,15 +41,11 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { + char *k = &((char *)key)[branch]; asm goto( "1: b %l[l_yes] \n\t" - " .pushsection __jump_table, \"aw\" \n\t" - " .align 3 \n\t" - " .long 1b - ., %l[l_yes] - . \n\t" - " .quad %c0 - . \n\t" - " .popsection \n\t" - : : "i"(&((char *)key)[branch]) : : l_yes); - + JUMP_TABLE_ENTRY(k, l_yes) + ); return false; l_yes: return true; diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index dd9ee67d1d87..b11cfb9fdd37 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -18,14 +18,21 @@ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) -#define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */ /* - * This bit indicates that the entry is present i.e. pmd_page() - * still points to a valid huge page in memory even if the pmd - * has been invalidated. + * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be + * interpreted according to the HW layout by SW but any attempted HW access to + * the address will result in a fault. pte_present() returns true. 
*/ -#define PMD_PRESENT_INVALID (_AT(pteval_t, 1) << 59) /* only when !PMD_SECT_VALID */ +#define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ + +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ +#define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ +#else +#define PTE_UFFD_WP (_AT(pteval_t, 0)) +#define PTE_SWP_UFFD_WP (_AT(pteval_t, 0)) +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) @@ -103,7 +110,7 @@ static inline bool __pure lpa2_is_enabled(void) __val; \ }) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_SHARED) #define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index afdd56d26ad7..bde9fd179388 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -105,7 +105,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) /* * The following only work if pte_present(). Undefined behaviour otherwise. */ -#define pte_present(pte) (!!(pte_val(pte) & (PTE_VALID | PTE_PROT_NONE))) +#define pte_present(pte) (pte_valid(pte) || pte_present_invalid(pte)) #define pte_young(pte) (!!(pte_val(pte) & PTE_AF)) #define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL)) #define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) @@ -132,6 +132,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) +#define pte_present_invalid(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_PRESENT_INVALID)) == PTE_PRESENT_INVALID) /* * Execute-only user mappings do not have the PTE_USER bit set. All valid * kernel mappings have the PTE_UXN bit set. 
@@ -261,6 +263,13 @@ static inline pte_t pte_mkpresent(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_VALID)); } +static inline pte_t pte_mkinvalid(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID)); + pte = clear_pte_bit(pte, __pgprot(PTE_VALID)); + return pte; +} + static inline pmd_t pmd_mkcont(pmd_t pmd) { return __pmd(pmd_val(pmd) | PMD_SECT_CONT); @@ -271,9 +280,31 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void __set_pte(pte_t *ptep, pte_t pte) +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pte_uffd_wp(pte_t pte) +{ + return !!(pte_val(pte) & PTE_UFFD_WP); +} + +static inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte_wrprotect(set_pte_bit(pte, __pgprot(PTE_UFFD_WP))); +} + +static inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_UFFD_WP)); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + +static inline void __set_pte_nosync(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); +} + +static inline void __set_pte(pte_t *ptep, pte_t pte) +{ + __set_pte_nosync(ptep, pte); /* * Only if the new pte is valid and kernel, otherwise TLB maintenance @@ -463,13 +494,39 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP)); +} + +static inline int pte_swp_uffd_wp(pte_t pte) +{ + return !!(pte_val(pte) & PTE_SWP_UFFD_WP); +} + +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP)); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h */ static inline int pte_protnone(pte_t pte) { - return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE; + /* + * pte_present_invalid() tells us that the pte is invalid from HW + * perspective but present from SW perspective, so the fields are to be + * interpretted as per the HW layout. The second 2 checks are the unique + * encoding that we use for PROT_NONE. It is insufficient to only use + * the first check because we share the same encoding scheme with pmds + * which support pmd_mkinvalid(), so can be present-invalid without + * being PROT_NONE. + */ + return pte_present_invalid(pte) && !pte_user(pte) && !pte_user_exec(pte); } static inline int pmd_protnone(pmd_t pmd) @@ -478,12 +535,7 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -#define pmd_present_invalid(pmd) (!!(pmd_val(pmd) & PMD_PRESENT_INVALID)) - -static inline int pmd_present(pmd_t pmd) -{ - return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd); -} +#define pmd_present(pmd) pte_present(pmd_pte(pmd)) /* * THP definitions. 
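As a simplified model of the encoding introduced above: PTE_PRESENT_INVALID marks an entry that software still treats as present while any hardware access faults, and PROT_NONE is just the present-invalid case that is neither user-readable nor user-executable. The sketch below is illustrative user-space C, not the kernel's real definitions (bit positions follow the arm64 descriptor layout, but the helpers are reduced to the minimum needed here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pteval_t;

#define PTE_VALID            (1ULL << 0)
#define PTE_USER             (1ULL << 6)	/* AP[1] */
#define PTE_NG               (1ULL << 11)
#define PTE_UXN              (1ULL << 54)
#define PTE_PRESENT_INVALID  PTE_NG		/* only meaningful when !PTE_VALID */

static bool pte_valid(pteval_t pte) { return pte & PTE_VALID; }
static bool pte_user(pteval_t pte) { return pte & PTE_USER; }
static bool pte_user_exec(pteval_t pte) { return !(pte & PTE_UXN); }

static bool pte_present_invalid(pteval_t pte)
{
	return (pte & (PTE_VALID | PTE_PRESENT_INVALID)) == PTE_PRESENT_INVALID;
}

static bool pte_present(pteval_t pte)
{
	return pte_valid(pte) || pte_present_invalid(pte);
}

static pteval_t pte_mkinvalid(pteval_t pte)
{
	return (pte | PTE_PRESENT_INVALID) & ~PTE_VALID;
}

static bool pte_protnone(pteval_t pte)
{
	/* Present-invalid and neither user-readable nor user-executable. */
	return pte_present_invalid(pte) && !pte_user(pte) && !pte_user_exec(pte);
}

int main(void)
{
	pteval_t user_pte = PTE_VALID | PTE_USER;		/* ordinary user mapping */
	pteval_t none_pte = PTE_PRESENT_INVALID | PTE_UXN;	/* PAGE_NONE-style entry */

	user_pte = pte_mkinvalid(user_pte);			/* e.g. a THP split path */

	printf("invalidated user pte: present=%d protnone=%d\n",
	       pte_present(user_pte), pte_protnone(user_pte));	/* 1 0 */
	printf("prot_none pte:        present=%d protnone=%d\n",
	       pte_present(none_pte), pte_protnone(none_pte));	/* 1 1 */
	return 0;
}

An entry made invalid via pte_mkinvalid()/pmd_mkinvalid() shares this encoding but keeps its user bits, which is why pte_protnone() must check more than the present-invalid bit alone.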
@@ -508,14 +560,16 @@ static inline int pmd_trans_huge(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) - -static inline pmd_t pmd_mkinvalid(pmd_t pmd) -{ - pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID)); - pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID)); - - return pmd; -} +#define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd))) +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd)) +#define pmd_mkuffd_wp(pmd) pte_pmd(pte_mkuffd_wp(pmd_pte(pmd))) +#define pmd_clear_uffd_wp(pmd) pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd))) +#define pmd_swp_uffd_wp(pmd) pte_swp_uffd_wp(pmd_pte(pmd)) +#define pmd_swp_mkuffd_wp(pmd) pte_pmd(pte_swp_mkuffd_wp(pmd_pte(pmd))) +#define pmd_swp_clear_uffd_wp(pmd) \ + pte_pmd(pte_swp_clear_uffd_wp(pmd_pte(pmd))) +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) @@ -760,6 +814,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #else +#define pud_valid(pud) false #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) #define pud_user_exec(pud) pud_user(pud) /* Always 0 with folding */ @@ -1005,6 +1060,8 @@ static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr) static inline bool pgtable_l5_enabled(void) { return false; } +#define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1)) + /* Match p4d_offset folding in <asm/generic/pgtable-nop4d.h> */ #define p4d_set_fixmap(addr) NULL #define p4d_set_fixmap_offset(p4dp, addr) ((p4d_t *)p4dp) @@ -1027,8 +1084,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK. 
*/ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | - PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP | - PTE_ATTRINDX_MASK; + PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE | + PTE_GP | PTE_ATTRINDX_MASK; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); @@ -1076,17 +1133,17 @@ static inline int pgd_devmap(pgd_t pgd) #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte) { - return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte)); + return pte_valid(pte) && (pte_user(pte) || pte_user_exec(pte)); } static inline bool pmd_user_accessible_page(pmd_t pmd) { - return pmd_leaf(pmd) && !pmd_present_invalid(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); + return pmd_valid(pmd) && !pmd_table(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); } static inline bool pud_user_accessible_page(pud_t pud) { - return pud_leaf(pud) && (pud_user(pud) || pud_user_exec(pud)); + return pud_valid(pud) && !pud_table(pud) && (pud_user(pud) || pud_user_exec(pud)); } #endif @@ -1248,15 +1305,16 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * Encode and decode a swap entry: * bits 0-1: present (must be zero) * bits 2: remember PG_anon_exclusive - * bits 3-7: swap type - * bits 8-57: swap offset - * bit 58: PTE_PROT_NONE (must be zero) + * bit 3: remember uffd-wp state + * bits 6-10: swap type + * bit 11: PTE_PRESENT_INVALID (must be zero) + * bits 12-61: swap offset */ -#define __SWP_TYPE_SHIFT 3 +#define __SWP_TYPE_SHIFT 6 #define __SWP_TYPE_BITS 5 -#define __SWP_OFFSET_BITS 50 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) -#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) +#define __SWP_OFFSET_SHIFT 12 +#define __SWP_OFFSET_BITS 50 #define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9e8999592f3a..af3b206fa423 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1036,18 +1036,18 @@ * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). */ -#define PIE_NONE_O 0x0 -#define PIE_R_O 0x1 -#define PIE_X_O 0x2 -#define PIE_RX_O 0x3 -#define PIE_RW_O 0x5 -#define PIE_RWnX_O 0x6 -#define PIE_RWX_O 0x7 -#define PIE_R 0x8 -#define PIE_GCS 0x9 -#define PIE_RX 0xa -#define PIE_RW 0xc -#define PIE_RWX 0xe +#define PIE_NONE_O UL(0x0) +#define PIE_R_O UL(0x1) +#define PIE_X_O UL(0x2) +#define PIE_RX_O UL(0x3) +#define PIE_RW_O UL(0x5) +#define PIE_RWnX_O UL(0x6) +#define PIE_RWX_O UL(0x7) +#define PIE_R UL(0x8) +#define PIE_GCS UL(0x9) +#define PIE_RX UL(0xa) +#define PIE_RW UL(0xc) +#define PIE_RWX UL(0xe) #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a75de2665d84..95fbc8c05607 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -142,17 +142,24 @@ static inline unsigned long get_trans_granule(void) * EL1, Inner Shareable". * */ -#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ - ({ \ - unsigned long __ta = (baddr); \ - unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? 
ttl : 0; \ - __ta &= GENMASK_ULL(36, 0); \ - __ta |= __ttl << 37; \ - __ta |= (unsigned long)(num) << 39; \ - __ta |= (unsigned long)(scale) << 44; \ - __ta |= get_trans_granule() << 46; \ - __ta |= (unsigned long)(asid) << 48; \ - __ta; \ +#define TLBIR_ASID_MASK GENMASK_ULL(63, 48) +#define TLBIR_TG_MASK GENMASK_ULL(47, 46) +#define TLBIR_SCALE_MASK GENMASK_ULL(45, 44) +#define TLBIR_NUM_MASK GENMASK_ULL(43, 39) +#define TLBIR_TTL_MASK GENMASK_ULL(38, 37) +#define TLBIR_BADDR_MASK GENMASK_ULL(36, 0) + +#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ + ({ \ + unsigned long __ta = 0; \ + unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ + __ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr); \ + __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \ + __ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \ + __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \ + __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \ + __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \ + __ta; \ }) /* These macros are used by the TLBI RANGE feature. */ @@ -439,11 +446,11 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, * When not uses TLB range ops, we can handle up to * (MAX_DVM_OPS - 1) pages; * When uses TLB range ops, we can handle up to - * (MAX_TLBI_RANGE_PAGES - 1) pages. + * MAX_TLBI_RANGE_PAGES pages. */ if ((!system_supports_tlb_range() && (end - start) >= (MAX_DVM_OPS * stride)) || - pages >= MAX_TLBI_RANGE_PAGES) { + pages > MAX_TLBI_RANGE_PAGES) { flush_tlb_mm(vma->vm_mm); return; } diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index dba8fcec7f33..e0e7b93c16cc 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -26,6 +26,7 @@ #include <linux/libfdt.h> #include <linux/smp.h> #include <linux/serial_core.h> +#include <linux/suspend.h> #include <linux/pgtable.h> #include <acpi/ghes.h> @@ -227,6 +228,15 @@ done: if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { +#ifdef CONFIG_HIBERNATION + struct acpi_table_header *facs = NULL; + acpi_get_table(ACPI_SIG_FACS, 1, &facs); + if (facs) { + swsusp_hardware_signature = + ((struct acpi_table_facs *)facs)->hardware_signature; + acpi_put_table(facs); + } +#endif acpi_parse_spcr(earlycon_acpi_spcr_enable, true); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 6d157f32187b..e8ed5673f481 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -10,94 +10,12 @@ #include <asm/pointer_auth.h> -struct frame_tail { - struct frame_tail __user *fp; - unsigned long lr; -} __attribute__((packed)); - -/* - * Get the return address for a single stackframe and return a pointer to the - * next frame tail. - */ -static struct frame_tail __user * -user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) -{ - struct frame_tail buftail; - unsigned long err; - unsigned long lr; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - lr = ptrauth_strip_user_insn_pac(buftail.lr); - - perf_callchain_store(entry, lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). 
- */ - if (tail >= buftail.fp) - return NULL; - - return buftail.fp; -} - -#ifdef CONFIG_COMPAT -/* - * The registers we're interested in are at the end of the variable - * length saved register structure. The fp points at the end of this - * structure so the address of this struct is: - * (struct compat_frame_tail *)(xxx->fp)-1 - * - * This code has been adapted from the ARM OProfile support. - */ -struct compat_frame_tail { - compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ - u32 sp; - u32 lr; -} __attribute__((packed)); - -static struct compat_frame_tail __user * -compat_user_backtrace(struct compat_frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) +static bool callchain_trace(void *data, unsigned long pc) { - struct compat_frame_tail buftail; - unsigned long err; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - perf_callchain_store(entry, buftail.lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). - */ - if (tail + 1 >= (struct compat_frame_tail __user *) - compat_ptr(buftail.fp)) - return NULL; + struct perf_callchain_entry_ctx *entry = data; - return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; + return perf_callchain_store(entry, pc) == 0; } -#endif /* CONFIG_COMPAT */ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) @@ -107,35 +25,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, return; } - perf_callchain_store(entry, regs->pc); - - if (!compat_user_mode(regs)) { - /* AARCH64 mode */ - struct frame_tail __user *tail; - - tail = (struct frame_tail __user *)regs->regs[29]; - - while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0x7)) - tail = user_backtrace(tail, entry); - } else { -#ifdef CONFIG_COMPAT - /* AARCH32 compat mode */ - struct compat_frame_tail __user *tail; - - tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - - while ((entry->nr < entry->max_stack) && - tail && !((unsigned long)tail & 0x3)) - tail = compat_user_backtrace(tail, entry); -#endif - } -} - -static bool callchain_trace(void *data, unsigned long pc) -{ - struct perf_callchain_entry_ctx *entry = data; - return perf_callchain_store(entry, pc) == 0; + arch_stack_walk_user(callchain_trace, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index aad399796e81..48c1aa456af9 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -108,6 +108,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .override = &id_aa64pfr0_override, .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), {} }, }; @@ -223,6 +224,7 @@ static const struct { { "nokaslr", "arm64_sw.nokaslr=1" }, { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, + { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, }; static int __init parse_hexdigit(const char *p, u64 *v) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 65a052bf741f..a096e2451044 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -298,8 +298,15 @@ void __init 
__no_sanitize_address setup_arch(char **cmdline_p) dynamic_scs_init(); /* - * Unmask SError as soon as possible after initializing earlycon so - * that we can report any SErrors immediately. + * The primary CPU enters the kernel with all DAIF exceptions masked. + * + * We must unmask Debug and SError before preemption or scheduling is + * possible to ensure that these are consistently unmasked across + * threads, and we want to unmask SError as soon as possible after + * initializing earlycon so that we can report any SErrors immediately. + * + * IRQ and FIQ will be unmasked after the root irqchip has been + * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 4ced34f62dab..31c8b3094dd7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -264,6 +264,13 @@ asmlinkage notrace void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); + /* + * Secondary CPUs enter the kernel with all DAIF exceptions masked. + * + * As with setup_arch() we must unmask Debug and SError exceptions, and + * as the root irqchip has already been detected and initialized we can + * unmask IRQ and FIQ at the same time. + */ local_daif_restore(DAIF_PROCCTX); /* diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 684c26511696..6b3258860377 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -324,3 +324,123 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) dump_backtrace(NULL, tsk, loglvl); barrier(); } + +/* + * The struct defined for userspace stack frame in AARCH64 mode. + */ +struct frame_tail { + struct frame_tail __user *fp; + unsigned long lr; +} __attribute__((packed)); + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. + */ +static struct frame_tail __user * +unwind_user_frame(struct frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct frame_tail buftail; + unsigned long err; + unsigned long lr; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + lr = ptrauth_strip_user_insn_pac(buftail.lr); + + if (!consume_entry(cookie, lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail >= buftail.fp) + return NULL; + + return buftail.fp; +} + +#ifdef CONFIG_COMPAT +/* + * The registers we're interested in are at the end of the variable + * length saved register structure. The fp points at the end of this + * structure so the address of this struct is: + * (struct compat_frame_tail *)(xxx->fp)-1 + * + * This code has been adapted from the ARM OProfile support. 
+ */ +struct compat_frame_tail { + compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ + u32 sp; + u32 lr; +} __attribute__((packed)); + +static struct compat_frame_tail __user * +unwind_compat_user_frame(struct compat_frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct compat_frame_tail buftail; + unsigned long err; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + if (!consume_entry(cookie, buftail.lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail + 1 >= (struct compat_frame_tail __user *) + compat_ptr(buftail.fp)) + return NULL; + + return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; +} +#endif /* CONFIG_COMPAT */ + + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + if (!consume_entry(cookie, regs->pc)) + return; + + if (!compat_user_mode(regs)) { + /* AARCH64 mode */ + struct frame_tail __user *tail; + + tail = (struct frame_tail __user *)regs->regs[29]; + while (tail && !((unsigned long)tail & 0x7)) + tail = unwind_user_frame(tail, cookie, consume_entry); + } else { +#ifdef CONFIG_COMPAT + /* AARCH32 compat mode */ + struct compat_frame_tail __user *tail; + + tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; + while (tail && !((unsigned long)tail & 0x3)) + tail = unwind_compat_user_frame(tail, cookie, consume_entry); +#endif + } +} diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index a635ab83fee3..b008a9b46a7f 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -1515,3 +1515,14 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) return insn; } + +u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, + enum aarch64_insn_system_register sysreg) +{ + u32 insn = aarch64_insn_get_mrs_value(); + + insn &= ~GENMASK(19, 0); + insn |= sysreg << 5; + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, + insn, result); +} diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 495b732d5af3..c927e9312f10 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -109,28 +109,12 @@ EXPORT_SYMBOL(phys_mem_access_prot); static phys_addr_t __init early_pgtable_alloc(int shift) { phys_addr_t phys; - void *ptr; phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, MEMBLOCK_ALLOC_NOLEAKTRACE); if (!phys) panic("Failed to allocate page table page\n"); - /* - * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE - * slot will be free, so we can (ab)use the FIX_PTE slot to initialise - * any level of table. - */ - ptr = pte_set_fixmap(phys); - - memset(ptr, 0, PAGE_SIZE); - - /* - * Implicit barriers also ensure the zeroed page is visible to the page - * table walker - */ - pte_clear_fixmap(); - return phys; } @@ -172,16 +156,25 @@ bool pgattr_change_is_safe(u64 old, u64 new) return ((old ^ new) & ~mask) == 0; } -static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot) +static void init_clear_pgtable(void *table) { - pte_t *ptep; + clear_page(table); - ptep = pte_set_fixmap_offset(pmdp, addr); + /* Ensure the zeroing is observed by page table walks. 
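The new arch_stack_walk_user() above hands each user-space return address to a stack_trace_consume_fn and stops as soon as the callback returns false, which is how perf_callchain_user() now drives the walk. A hedged sketch of a consumer follows; the typedef mirrors the kernel's callback shape, while the collector struct and function names are made up for illustration:

#include <stdbool.h>

/* Same shape as the kernel's stack_trace_consume_fn. */
typedef bool (*stack_trace_consume_fn)(void *cookie, unsigned long addr);

/* Hypothetical collector, not a kernel structure. */
struct callchain_buf {
	unsigned long entries[64];
	unsigned int nr;
};

/* Record each address; returning false asks the walker to stop early. */
static bool collect_entry(void *cookie, unsigned long addr)
{
	struct callchain_buf *buf = cookie;

	if (buf->nr >= 64)
		return false;
	buf->entries[buf->nr++] = addr;
	return true;
}

/* A caller would then do: arch_stack_walk_user(collect_entry, &buf, regs); */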
*/ + dsb(ishst); +} + +static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot) +{ do { pte_t old_pte = __ptep_get(ptep); - __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + /* + * Required barriers to make this visible to the table walker + * are deferred to the end of alloc_init_cont_pte(). + */ + __set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we @@ -192,8 +185,6 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); - - pte_clear_fixmap(); } static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, @@ -204,6 +195,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); + pte_t *ptep; BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { @@ -214,10 +206,14 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); pte_phys = pgtable_alloc(PAGE_SHIFT); + ptep = pte_set_fixmap(pte_phys); + init_clear_pgtable(ptep); + ptep += pte_index(addr); __pmd_populate(pmdp, pte_phys, pmdval); - pmd = READ_ONCE(*pmdp); + } else { + BUG_ON(pmd_bad(pmd)); + ptep = pte_set_fixmap_offset(pmdp, addr); } - BUG_ON(pmd_bad(pmd)); do { pgprot_t __prot = prot; @@ -229,20 +225,26 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pte(pmdp, addr, next, phys, __prot); + init_pte(ptep, addr, next, phys, __prot); + ptep += pte_index(next) - pte_index(addr); phys += next - addr; } while (addr = next, addr != end); + + /* + * Note: barriers and maintenance necessary to clear the fixmap slot + * ensure that all previous pgtable writes are visible to the table + * walker. + */ + pte_clear_fixmap(); } -static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, +static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; - pmd_t *pmdp; - pmdp = pmd_set_fixmap_offset(pudp, addr); do { pmd_t old_pmd = READ_ONCE(*pmdp); @@ -268,8 +270,6 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, } phys += next - addr; } while (pmdp++, addr = next, addr != end); - - pmd_clear_fixmap(); } static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, @@ -279,6 +279,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, { unsigned long next; pud_t pud = READ_ONCE(*pudp); + pmd_t *pmdp; /* * Check for initial section mappings in the pgd/pud. 
@@ -292,10 +293,14 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); pmd_phys = pgtable_alloc(PMD_SHIFT); + pmdp = pmd_set_fixmap(pmd_phys); + init_clear_pgtable(pmdp); + pmdp += pmd_index(addr); __pud_populate(pudp, pmd_phys, pudval); - pud = READ_ONCE(*pudp); + } else { + BUG_ON(pud_bad(pud)); + pmdp = pmd_set_fixmap_offset(pudp, addr); } - BUG_ON(pud_bad(pud)); do { pgprot_t __prot = prot; @@ -307,10 +312,13 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags); + init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + pmdp += pmd_index(next) - pmd_index(addr); phys += next - addr; } while (addr = next, addr != end); + + pmd_clear_fixmap(); } static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, @@ -330,12 +338,15 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(PUD_SHIFT); + pudp = pud_set_fixmap(pud_phys); + init_clear_pgtable(pudp); + pudp += pud_index(addr); __p4d_populate(p4dp, pud_phys, p4dval); - p4d = READ_ONCE(*p4dp); + } else { + BUG_ON(p4d_bad(p4d)); + pudp = pud_set_fixmap_offset(p4dp, addr); } - BUG_ON(p4d_bad(p4d)); - pudp = pud_set_fixmap_offset(p4dp, addr); do { pud_t old_pud = READ_ONCE(*pudp); @@ -385,12 +396,15 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); p4d_phys = pgtable_alloc(P4D_SHIFT); + p4dp = p4d_set_fixmap(p4d_phys); + init_clear_pgtable(p4dp); + p4dp += p4d_index(addr); __pgd_populate(pgdp, p4d_phys, pgdval); - pgd = READ_ONCE(*pgdp); + } else { + BUG_ON(pgd_bad(pgd)); + p4dp = p4d_set_fixmap_offset(pgdp, addr); } - BUG_ON(pgd_bad(pgd)); - p4dp = p4d_set_fixmap_offset(pgdp, addr); do { p4d_t old_p4d = READ_ONCE(*p4dp); @@ -457,11 +471,10 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, static phys_addr_t __pgd_pgtable_alloc(int shift) { - void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL); - BUG_ON(!ptr); + /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ + void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); - /* Ensure the zeroed page is visible to the page table walker */ - dsb(ishst); + BUG_ON(!ptr); return __pa(ptr); } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9d40f3ffd8d2..f4bc6c5bac06 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -135,14 +135,6 @@ SYM_FUNC_START(cpu_do_resume) msr tcr_el1, x8 msr vbar_el1, x9 - - /* - * __cpu_setup() cleared MDSCR_EL1.MDE and friends, before unmasking - * debug exceptions. By restoring MDSCR_EL1 here, we may take a debug - * exception. Mask them until local_daif_restore() in cpu_suspend() - * resets them. 
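The mmu.c rework above batches early page-table construction: a new table is zeroed once through init_clear_pgtable(), entries are written with the nosync setters, and a single barrier plus fixmap teardown at the end publishes the whole run to the table walker. A plain-C model of that zero/fill/publish pattern; the barrier is a compiler-only stand-in for dsb(ishst) and the sizes are illustrative, not the arm64 helpers:

#include <stdint.h>
#include <string.h>

#define PTRS_PER_PTE	512
#define PAGE_SIZE	4096ULL

/* Stand-in for dsb(ishst): one ordering point after the whole batch. */
static inline void publish_tables(void)
{
	__asm__ volatile("" ::: "memory");
}

static void fill_pte_table(uint64_t *table, uint64_t phys, uint64_t prot, int n)
{
	/* Zero the fresh table once (init_clear_pgtable() in the patch)... */
	memset(table, 0, PTRS_PER_PTE * sizeof(*table));

	/* ...fill entries with no per-entry sync (__set_pte_nosync())... */
	for (int i = 0; i < n; i++)
		table[i] = (phys + i * PAGE_SIZE) | prot;

	/* ...then publish everything with a single barrier at the end. */
	publish_tables();
}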
- */ - disable_daif msr mdscr_el1, x10 msr sctlr_el1, x12 @@ -466,8 +458,6 @@ SYM_FUNC_START(__cpu_setup) msr cpacr_el1, xzr // Reset cpacr_el1 mov x1, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x1 // access to the DCC from EL0 - isb // Unmask debug exceptions now, - enable_dbg // since this is per-cpu reset_pmuserenr_el0 x1 // Disable PMU access from EL0 reset_amuserenr_el0 x1 // Disable AMU access from EL0 diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 23b1b34db088..b22ab2f97a30 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -297,4 +297,12 @@ #define A64_ADR(Rd, offset) \ aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR) +/* MRS */ +#define A64_MRS_TPIDR_EL1(Rt) \ + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL1) +#define A64_MRS_TPIDR_EL2(Rt) \ + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2) +#define A64_MRS_SP_EL0(Rt) \ + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0) + #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 074b1d156223..720336d28856 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -29,6 +29,7 @@ #define TCALL_CNT (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) #define FP_BOTTOM (MAX_BPF_JIT_REG + 4) +#define ARENA_VM_START (MAX_BPF_JIT_REG + 5) #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -67,6 +68,8 @@ static const int bpf2a64[] = { /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), [FP_BOTTOM] = A64_R(27), + /* callee saved register for kern_vm_start address */ + [ARENA_VM_START] = A64_R(28), }; struct jit_ctx { @@ -79,6 +82,7 @@ struct jit_ctx { __le32 *ro_image; u32 stack_size; int fpb_offset; + u64 user_vm_start; }; struct bpf_plt { @@ -295,7 +299,7 @@ static bool is_lsi_offset(int offset, int scale) #define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 8) static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, - bool is_exception_cb) + bool is_exception_cb, u64 arena_vm_start) { const struct bpf_prog *prog = ctx->prog; const bool is_main_prog = !bpf_is_subprog(prog); @@ -306,6 +310,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, const u8 fp = bpf2a64[BPF_REG_FP]; const u8 tcc = bpf2a64[TCALL_CNT]; const u8 fpb = bpf2a64[FP_BOTTOM]; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const int idx0 = ctx->idx; int cur_offset; @@ -411,6 +416,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, /* Set up function call stack */ emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + + if (arena_vm_start) + emit_a64_mov_i64(arena_vm_base, arena_vm_start, ctx); + return 0; } @@ -485,20 +494,26 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) static int emit_lse_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) { const u8 code = insn->code; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const u8 dst = bpf2a64[insn->dst_reg]; const u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; const bool isdw = BPF_SIZE(code) == BPF_DW; + const bool arena = BPF_MODE(code) == BPF_PROBE_ATOMIC; const s16 off = insn->off; - u8 reg; + u8 reg = dst; - if (!off) { - reg = dst; - } else { - emit_a64_mov_i(1, tmp, off, ctx); - emit(A64_ADD(1, tmp, tmp, dst), ctx); - reg = tmp; + if (off || arena) { + if (off) { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_ADD(1, tmp, tmp, dst), ctx); + reg = tmp; + } + if 
(arena) { + emit(A64_ADD(1, tmp, reg, arena_vm_base), ctx); + reg = tmp; + } } switch (insn->imm) { @@ -567,6 +582,12 @@ static int emit_ll_sc_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) u8 reg; s32 jmp_offset; + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + /* ll_sc based atomics don't support unsafe pointers yet. */ + pr_err_once("unknown atomic opcode %02x\n", code); + return -EINVAL; + } + if (!off) { reg = dst; } else { @@ -738,6 +759,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) +#define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -745,7 +767,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); - regs->regs[dst_reg] = 0; + if (dst_reg != DONT_CLEAR) + regs->regs[dst_reg] = 0; regs->pc = (unsigned long)&ex->fixup - offset; return true; } @@ -765,7 +788,9 @@ static int add_exception_handler(const struct bpf_insn *insn, return 0; if (BPF_MODE(insn->code) != BPF_PROBE_MEM && - BPF_MODE(insn->code) != BPF_PROBE_MEMSX) + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; if (!ctx->prog->aux->extable || @@ -810,6 +835,9 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->insn = ins_offset; + if (BPF_CLASS(insn->code) != BPF_LDX) + dst_reg = DONT_CLEAR; + ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); @@ -829,12 +857,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool extra_pass) { const u8 code = insn->code; - const u8 dst = bpf2a64[insn->dst_reg]; - const u8 src = bpf2a64[insn->src_reg]; + u8 dst = bpf2a64[insn->dst_reg]; + u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 fpb = bpf2a64[FP_BOTTOM]; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const s16 off = insn->off; const s32 imm = insn->imm; const int i = insn - ctx->prog->insnsi; @@ -853,6 +882,24 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + emit(A64_MOV(0, tmp, src), ctx); // 32-bit mov clears the upper 32 bits + emit_a64_mov_i(0, dst, ctx->user_vm_start >> 32, ctx); + emit(A64_LSL(1, dst, dst, 32), ctx); + emit(A64_CBZ(1, tmp, 2), ctx); + emit(A64_ORR(1, tmp, dst, tmp), ctx); + emit(A64_MOV(1, dst, tmp), ctx); + break; + } else if (insn_is_mov_percpu_addr(insn)) { + if (dst != src) + emit(A64_MOV(1, dst, src), ctx); + if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) + emit(A64_MRS_TPIDR_EL2(tmp), ctx); + else + emit(A64_MRS_TPIDR_EL1(tmp), ctx); + emit(A64_ADD(1, dst, dst, tmp), ctx); + break; + } switch (insn->off) { case 0: emit(A64_MOV(is64, dst, src), ctx); @@ -1181,6 +1228,21 @@ emit_cond_jmp: const u8 r0 = bpf2a64[BPF_REG_0]; bool func_addr_fixed; u64 func_addr; + u32 cpu_offset; + + /* Implement helper call to bpf_get_smp_processor_id() inline */ + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { + cpu_offset = offsetof(struct thread_info, cpu); + + emit(A64_MRS_SP_EL0(tmp), ctx); + if (is_lsi_offset(cpu_offset, 2)) { + 
emit(A64_LDR32I(r0, tmp, cpu_offset), ctx); + } else { + emit_a64_mov_i(1, tmp2, cpu_offset, ctx); + emit(A64_LDR32(r0, tmp, tmp2), ctx); + } + break; + } ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &func_addr, &func_addr_fixed); @@ -1237,7 +1299,15 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: - if (ctx->fpb_offset > 0 && src == fp) { + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, src, arena_vm_base), ctx); + src = tmp2; + } + if (ctx->fpb_offset > 0 && src == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { src_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1322,7 +1392,15 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: - if (ctx->fpb_offset > 0 && dst == fp) { + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); + dst = tmp2; + } + if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { dst_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1365,6 +1443,10 @@ emit_cond_jmp: } break; } + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; /* STX: *(size *)(dst + off) = src */ @@ -1372,7 +1454,15 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: - if (ctx->fpb_offset > 0 && dst == fp) { + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); + dst = tmp2; + } + if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { dst_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1413,16 +1503,26 @@ emit_cond_jmp: } break; } + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) ret = emit_lse_atomic(insn, ctx); else ret = emit_ll_sc_atomic(insn, ctx); if (ret) return ret; + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; default: @@ -1594,6 +1694,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; + u64 arena_vm_start; u8 *image_ptr; u8 *ro_image_ptr; @@ -1611,6 +1712,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } + arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); @@ -1641,6 +1743,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.fpb_offset = find_fpb_offset(prog); + ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. 
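For the arena accesses above (BPF_PROBE_MEM32 loads/stores and BPF_PROBE_ATOMIC), the JIT keeps the arena's kernel base in the callee-saved ARENA_VM_START register set up in the prologue and adds it to the 32-bit arena offset before the access. A plain-C model of that translation; the base value is made up, the real one comes from bpf_arena_get_kern_vm_start():

#include <stdint.h>

/* Hypothetical arena base; the JIT materializes the real value once. */
static const uint64_t kern_vm_start = 0xffff800080000000ULL;

/*
 * BPF_PROBE_MEM32 semantics: the program supplies a 32-bit offset into
 * the arena and the generated code forms the kernel address by adding
 * the arena base held in ARENA_VM_START.
 */
static inline uint64_t arena_kern_addr(uint32_t offset)
{
	return kern_vm_start + offset;
}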
@@ -1648,7 +1751,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * BPF line info needs ctx->offset[i] to be the offset of * instruction[i] in jited image, so build prologue first. */ - if (build_prologue(&ctx, was_classic, prog->aux->exception_cb)) { + if (build_prologue(&ctx, was_classic, prog->aux->exception_cb, + arena_vm_start)) { prog = orig_prog; goto out_off; } @@ -1696,7 +1800,7 @@ skip_init_ctx: ctx.idx = 0; ctx.exentry_idx = 0; - build_prologue(&ctx, was_classic, prog->aux->exception_cb); + build_prologue(&ctx, was_classic, prog->aux->exception_cb, arena_vm_start); if (build_body(&ctx, extra_pass)) { prog = orig_prog; @@ -2165,12 +2269,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) bpf_prog_pack_free(image, size); } -void arch_protect_bpf_trampoline(void *image, unsigned int size) -{ -} - -void arch_unprotect_bpf_trampoline(void *image, unsigned int size) +int arch_protect_bpf_trampoline(void *image, unsigned int size) { + return 0; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, @@ -2453,6 +2554,39 @@ bool bpf_jit_supports_exceptions(void) return true; } +bool bpf_jit_supports_arena(void) +{ + return true; +} + +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (!cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) + return false; + } + return true; +} + +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} + +bool bpf_jit_inlines_helper_call(s32 imm) +{ + switch (imm) { + case BPF_FUNC_get_smp_processor_id: + return true; + default: + return false; + } +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index e73323d759d0..7dbefd4ba210 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1294,16 +1294,19 @@ skip_init_ctx: flush_icache_range((unsigned long)header, (unsigned long)(ctx.image + ctx.idx)); if (!prog->is_func || extra_pass) { + int err; + if (extra_pass && ctx.idx != jit_data->ctx.idx) { pr_err_once("multi-func JIT bug %d != %d\n", ctx.idx, jit_data->ctx.idx); - bpf_jit_binary_free(header); - prog->bpf_func = NULL; - prog->jited = 0; - prog->jited_len = 0; - goto out_offset; + goto out_free; + } + err = bpf_jit_binary_lock_ro(header); + if (err) { + pr_err_once("bpf_jit_binary_lock_ro() returned %d\n", + err); + goto out_free; } - bpf_jit_binary_lock_ro(header); } else { jit_data->ctx = ctx; jit_data->image = image_ptr; @@ -1334,6 +1337,13 @@ out: out_offset = -1; return prog; + +out_free: + bpf_jit_binary_free(header); + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + goto out_offset; } /* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. 
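bpf_jit_inlines_helper_call() above lets the arm64 JIT replace calls to bpf_get_smp_processor_id() with an MRS of SP_EL0 (which holds the current task pointer, whose first member is struct thread_info) plus a load of the cpu field. A conceptual C model of what the emitted code computes; the struct layout here is illustrative only, the JIT uses the real offsetof(struct thread_info, cpu):

#include <stddef.h>

/* Illustrative layout; the real struct thread_info differs. */
struct thread_info_model {
	unsigned long flags;
	int cpu;
};

/*
 * Equivalent of "mrs tmp, sp_el0; ldr w0, [tmp, #cpu_offset]": one load
 * from the current thread_info instead of a helper call.
 */
static int inlined_smp_processor_id(const struct thread_info_model *ti)
{
	return ti->cpu;
}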
*/ diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 6ffa29585194..cc26df907bfe 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -3,8 +3,8 @@ config M68K bool default y select ARCH_32BIT_OFF_T - select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_BINFMT_FLAT + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CPU_FINALIZE_INIT if MMU select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DMA_PREP_COHERENT if M68K_NONCOHERENT_DMA && !COLDFIRE @@ -18,7 +18,7 @@ config M68K select DMA_DIRECT_REMAP if M68K_NONCOHERENT_DMA && !COLDFIRE select GENERIC_ATOMIC64 select GENERIC_CPU_DEVICES - select GENERIC_IOMAP + select GENERIC_IOMAP if HAS_IOPORT select GENERIC_IRQ_SHOW select GENERIC_LIB_ASHLDI3 select GENERIC_LIB_ASHRDI3 diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 99718f3dc686..d4b170c861bf 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -836,7 +836,7 @@ static void amiga_get_hardware_list(struct seq_file *m) seq_printf(m, "\tZorro II%s AutoConfig: %d Expansion " "Device%s\n", AMIGAHW_PRESENT(ZORRO3) ? "I" : "", - zorro_num_autocon, zorro_num_autocon == 1 ? "" : "s"); + zorro_num_autocon, str_plural(zorro_num_autocon)); #endif /* CONFIG_ZORRO */ #undef AMIGAHW_ANNOUNCE diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index a6f6607efe79..6bb202d03d34 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -26,6 +26,7 @@ CONFIG_AMIGA_BUILTIN_SERIAL=y CONFIG_SERIAL_CONSOLE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_ATARI_PARTITION=y CONFIG_MAC_PARTITION=y @@ -217,7 +218,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -624,8 +624,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 0ca1f9f930bc..3598e0cc66b0 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -21,6 +21,7 @@ CONFIG_HEARTBEAT=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -213,7 +214,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -581,8 +581,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index be030659d8d7..3fbb8a9a307d 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -29,6 +29,7 @@ CONFIG_ATARI_ETHERNEC=y CONFIG_ATARI_DSP56K=m CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_MAC_PARTITION=y @@ -220,7 +221,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m 
CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -601,8 +601,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index ad8f81fbb630..9a2b0c7871ce 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -19,6 +19,7 @@ CONFIG_BVME6000=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -210,7 +211,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -573,8 +573,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index ff253b6accec..2408785e0e48 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -20,6 +20,7 @@ CONFIG_HP300=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -212,7 +213,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -583,8 +583,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index f92b866620a7..6789d03b7b52 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -20,6 +20,7 @@ CONFIG_MAC=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -211,7 +212,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -600,8 +600,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index bd813beaa8a7..b57af39db3b4 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -44,6 +44,7 @@ CONFIG_AMIGA_BUILTIN_SERIAL=y CONFIG_SERIAL_CONSOLE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_BSD_DISKLABEL=y CONFIG_MINIX_SUBPARTITION=y @@ -231,7 +232,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -686,8 +686,6 @@ 
CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 2237ee0fe433..29b70f27aedd 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -18,6 +18,7 @@ CONFIG_MVME147=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -209,7 +210,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -572,8 +572,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index afb5aa9c5012..757c582ffe84 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -19,6 +19,7 @@ CONFIG_MVME16x=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -210,7 +211,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -573,8 +573,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index e40f7a308966..f15d1009108a 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -19,6 +19,7 @@ CONFIG_HEARTBEAT=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -211,7 +212,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -590,8 +590,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 4df397c0395f..5218c1e614b5 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -16,6 +16,7 @@ CONFIG_SUN3=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -206,7 +207,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -570,8 +570,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m 
-CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index aa7719b3947f..acdf4eb7af28 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -16,6 +16,7 @@ CONFIG_SUN3X=y CONFIG_PROC_HARDWARE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_TRIM_UNUSED_KSYMS=y CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y CONFIG_ATARI_PARTITION=y @@ -207,7 +208,6 @@ CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_DUP_IPV6=m @@ -571,8 +571,6 @@ CONFIG_REED_SOLOMON_TEST=m CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m -CONFIG_STRING_SELFTEST=m -CONFIG_TEST_STRING_HELPERS=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m diff --git a/arch/m68k/include/asm/pgtable.h b/arch/m68k/include/asm/pgtable.h index 27525c6a12fd..49fcfd734860 100644 --- a/arch/m68k/include/asm/pgtable.h +++ b/arch/m68k/include/asm/pgtable.h @@ -2,6 +2,8 @@ #ifndef __M68K_PGTABLE_H #define __M68K_PGTABLE_H +#include <asm/page.h> + #ifdef __uClinux__ #include <asm/pgtable_no.h> #else diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 31be2ad999ca..3e31adbddc75 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -12,14 +12,15 @@ */ #if PAGE_SHIFT < 13 #ifdef CONFIG_4KSTACKS -#define THREAD_SIZE 4096 +#define THREAD_SIZE_ORDER 0 #else -#define THREAD_SIZE 8192 +#define THREAD_SIZE_ORDER 1 #endif #else -#define THREAD_SIZE PAGE_SIZE +#define THREAD_SIZE_ORDER 0 #endif -#define THREAD_SIZE_ORDER ((THREAD_SIZE / PAGE_SIZE) - 1) + +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #ifndef __ASSEMBLY__ diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 3bcdd32a6b36..338b474910f7 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -430,7 +430,9 @@ resume: movec %a0,%dfc /* restore status register */ - movew %a1@(TASK_THREAD+THREAD_SR),%sr + movew %a1@(TASK_THREAD+THREAD_SR),%d0 + oriw #0x0700,%d0 + movew %d0,%sr rts diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 4c8f8cbfa05f..e7f0f72c1b36 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -453,30 +453,18 @@ void mac_poweroff(void) void mac_reset(void) { - if (macintosh_config->adb_type == MAC_ADB_II && - macintosh_config->ident != MAC_MODEL_SE30) { - /* need ROMBASE in booter */ - /* indeed, plus need to MAP THE ROM !! */ - - if (mac_bi_data.rombase == 0) - mac_bi_data.rombase = 0x40800000; - - /* works on some */ - rom_reset = (void *) (mac_bi_data.rombase + 0xa); - - local_irq_disable(); - rom_reset(); #ifdef CONFIG_ADB_CUDA - } else if (macintosh_config->adb_type == MAC_ADB_EGRET || - macintosh_config->adb_type == MAC_ADB_CUDA) { + if (macintosh_config->adb_type == MAC_ADB_EGRET || + macintosh_config->adb_type == MAC_ADB_CUDA) { cuda_restart(); + } else #endif #ifdef CONFIG_ADB_PMU - } else if (macintosh_config->adb_type == MAC_ADB_PB2) { + if (macintosh_config->adb_type == MAC_ADB_PB2) { pmu_restart(); + } else #endif - } else if (CPU_IS_030) { - + if (CPU_IS_030) { /* 030-specific reset routine. The idea is general, but the * specific registers to reset are '030-specific. Until I * have a non-030 machine, I can't test anything else. 
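The m68k thread_info.h change above inverts the derivation: THREAD_SIZE_ORDER is now chosen directly per configuration and THREAD_SIZE is computed as PAGE_SIZE << THREAD_SIZE_ORDER, rather than back-computing the order from the size. A quick sanity check of the three cases; it assumes 4 KiB pages for the first two and 8 KiB pages for the last, purely for illustration:

#include <assert.h>

int main(void)
{
	assert((4096u << 0) == 4096u);	/* PAGE_SHIFT < 13, CONFIG_4KSTACKS */
	assert((4096u << 1) == 8192u);	/* PAGE_SHIFT < 13, 8 KiB stacks */
	assert((8192u << 0) == 8192u);	/* PAGE_SHIFT >= 13, one page */
	return 0;
}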
@@ -524,6 +512,18 @@ void mac_reset(void) "jmp %/a0@\n\t" /* jump to the reset vector */ ".chip 68k" : : "r" (offset), "a" (rombase) : "a0"); + } else { + /* need ROMBASE in booter */ + /* indeed, plus need to MAP THE ROM !! */ + + if (mac_bi_data.rombase == 0) + mac_bi_data.rombase = 0x40800000; + + /* works on some */ + rom_reset = (void *)(mac_bi_data.rombase + 0xa); + + local_irq_disable(); + rom_reset(); } /* should never get here */ diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 4da7bc4ac4a3..176314f3c9aa 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -4,7 +4,7 @@ CONFIG_AUDIT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_EXPERT=y -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y CONFIG_KALLSYMS_ALL=y CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 diff --git a/arch/mips/configs/rs90_defconfig b/arch/mips/configs/rs90_defconfig index 4b9e36d6400e..a53dd66e9b86 100644 --- a/arch/mips/configs/rs90_defconfig +++ b/arch/mips/configs/rs90_defconfig @@ -9,7 +9,7 @@ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y # CONFIG_SGETMASK_SYSCALL is not set # CONFIG_SYSFS_SYSCALL is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_TIMERFD is not set # CONFIG_AIO is not set # CONFIG_IO_URING is not set diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c index 6c3704f51d0d..87f0a1436bf9 100644 --- a/arch/mips/dec/setup.c +++ b/arch/mips/dec/setup.c @@ -756,7 +756,7 @@ void __init arch_init_irq(void) NULL)) pr_err("Failed to register fpu interrupt\n"); desc_fpu = irq_to_desc(irq_fpu); - fpu_kstat_irq = this_cpu_ptr(desc_fpu->kstat_irqs); + fpu_kstat_irq = this_cpu_ptr(&desc_fpu->kstat_irqs->cnt); } if (dec_interrupt[DEC_IRQ_CASCADE] >= 0) { if (request_irq(dec_interrupt[DEC_IRQ_CASCADE], no_action, diff --git a/arch/mips/net/bpf_jit_comp.c b/arch/mips/net/bpf_jit_comp.c index a40d926b6513..e355dfca4400 100644 --- a/arch/mips/net/bpf_jit_comp.c +++ b/arch/mips/net/bpf_jit_comp.c @@ -1012,7 +1012,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_prog_fill_jited_linfo(prog, &ctx.descriptors[1]); /* Set as read-only exec and flush instruction cache */ - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) + goto out_err; flush_icache_range((unsigned long)header, (unsigned long)&ctx.target[ctx.jit_index]); diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 3586cda55bde..69c0258700b2 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -188,6 +188,15 @@ config SMP If you don't know what to do here, say N. +config FPU + bool "FPU support" + default y + help + Say N here if you want to disable all floating-point related procedures + in the kernel and reduce binary size. + + If you don't know what to do here, say Y. 
+ source "kernel/Kconfig.hz" config OPENRISC_NO_SPR_SR_DSX diff --git a/arch/openrisc/include/asm/fpu.h b/arch/openrisc/include/asm/fpu.h new file mode 100644 index 000000000000..57bc44d80d53 --- /dev/null +++ b/arch/openrisc/include/asm/fpu.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_OPENRISC_FPU_H +#define __ASM_OPENRISC_FPU_H + +struct task_struct; + +#ifdef CONFIG_FPU +static inline void save_fpu(struct task_struct *task) +{ + task->thread.fpcsr = mfspr(SPR_FPCSR); +} + +static inline void restore_fpu(struct task_struct *task) +{ + mtspr(SPR_FPCSR, task->thread.fpcsr); +} +#else +#define save_fpu(tsk) do { } while (0) +#define restore_fpu(tsk) do { } while (0) +#endif + +#endif /* __ASM_OPENRISC_FPU_H */ diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index 3b736e74e6ed..e05d1b59e24e 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -44,6 +44,7 @@ struct task_struct; struct thread_struct { + long fpcsr; /* Floating point control status register. */ }; /* diff --git a/arch/openrisc/include/asm/ptrace.h b/arch/openrisc/include/asm/ptrace.h index 375147ff71fc..1da3e66292e2 100644 --- a/arch/openrisc/include/asm/ptrace.h +++ b/arch/openrisc/include/asm/ptrace.h @@ -59,7 +59,7 @@ struct pt_regs { * -1 for all other exceptions. */ long orig_gpr11; /* For restarting system calls */ - long fpcsr; /* Floating point control status register. */ + long dummy; /* Cheap alignment fix */ long dummy2; /* Cheap alignment fix */ }; @@ -115,6 +115,5 @@ static inline long regs_return_value(struct pt_regs *regs) #define PT_GPR31 124 #define PT_PC 128 #define PT_ORIG_GPR11 132 -#define PT_FPCSR 136 #endif /* __ASM_OPENRISC_PTRACE_H */ diff --git a/arch/openrisc/include/uapi/asm/elf.h b/arch/openrisc/include/uapi/asm/elf.h index 6868f81c281e..441e343f8268 100644 --- a/arch/openrisc/include/uapi/asm/elf.h +++ b/arch/openrisc/include/uapi/asm/elf.h @@ -34,15 +34,72 @@ #include <asm/ptrace.h> /* The OR1K relocation types... 
not all relevant for module loader */ -#define R_OR32_NONE 0 -#define R_OR32_32 1 -#define R_OR32_16 2 -#define R_OR32_8 3 -#define R_OR32_CONST 4 -#define R_OR32_CONSTH 5 -#define R_OR32_JUMPTARG 6 -#define R_OR32_VTINHERIT 7 -#define R_OR32_VTENTRY 8 +#define R_OR1K_NONE 0 +#define R_OR1K_32 1 +#define R_OR1K_16 2 +#define R_OR1K_8 3 +#define R_OR1K_LO_16_IN_INSN 4 +#define R_OR1K_HI_16_IN_INSN 5 +#define R_OR1K_INSN_REL_26 6 +#define R_OR1K_GNU_VTENTRY 7 +#define R_OR1K_GNU_VTINHERIT 8 +#define R_OR1K_32_PCREL 9 +#define R_OR1K_16_PCREL 10 +#define R_OR1K_8_PCREL 11 +#define R_OR1K_GOTPC_HI16 12 +#define R_OR1K_GOTPC_LO16 13 +#define R_OR1K_GOT16 14 +#define R_OR1K_PLT26 15 +#define R_OR1K_GOTOFF_HI16 16 +#define R_OR1K_GOTOFF_LO16 17 +#define R_OR1K_COPY 18 +#define R_OR1K_GLOB_DAT 19 +#define R_OR1K_JMP_SLOT 20 +#define R_OR1K_RELATIVE 21 +#define R_OR1K_TLS_GD_HI16 22 +#define R_OR1K_TLS_GD_LO16 23 +#define R_OR1K_TLS_LDM_HI16 24 +#define R_OR1K_TLS_LDM_LO16 25 +#define R_OR1K_TLS_LDO_HI16 26 +#define R_OR1K_TLS_LDO_LO16 27 +#define R_OR1K_TLS_IE_HI16 28 +#define R_OR1K_TLS_IE_LO16 29 +#define R_OR1K_TLS_LE_HI16 30 +#define R_OR1K_TLS_LE_LO16 31 +#define R_OR1K_TLS_TPOFF 32 +#define R_OR1K_TLS_DTPOFF 33 +#define R_OR1K_TLS_DTPMOD 34 +#define R_OR1K_AHI16 35 +#define R_OR1K_GOTOFF_AHI16 36 +#define R_OR1K_TLS_IE_AHI16 37 +#define R_OR1K_TLS_LE_AHI16 38 +#define R_OR1K_SLO16 39 +#define R_OR1K_GOTOFF_SLO16 40 +#define R_OR1K_TLS_LE_SLO16 41 +#define R_OR1K_PCREL_PG21 42 +#define R_OR1K_GOT_PG21 43 +#define R_OR1K_TLS_GD_PG21 44 +#define R_OR1K_TLS_LDM_PG21 45 +#define R_OR1K_TLS_IE_PG21 46 +#define R_OR1K_LO13 47 +#define R_OR1K_GOT_LO13 48 +#define R_OR1K_TLS_GD_LO13 49 +#define R_OR1K_TLS_LDM_LO13 50 +#define R_OR1K_TLS_IE_LO13 51 +#define R_OR1K_SLO13 52 +#define R_OR1K_PLTA26 53 +#define R_OR1K_GOT_AHI16 54 + +/* Old relocation names */ +#define R_OR32_NONE R_OR1K_NONE +#define R_OR32_32 R_OR1K_32 +#define R_OR32_16 R_OR1K_16 +#define R_OR32_8 R_OR1K_8 +#define R_OR32_CONST R_OR1K_LO_16_IN_INSN +#define R_OR32_CONSTH R_OR1K_HI_16_IN_INSN +#define R_OR32_JUMPTARG R_OR1K_INSN_REL_26 +#define R_OR32_VTENTRY R_OR1K_GNU_VTENTRY +#define R_OR32_VTINHERIT R_OR1K_GNU_VTINHERIT typedef unsigned long elf_greg_t; diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index c9f48e750b72..440711d7bf40 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -106,8 +106,6 @@ l.mtspr r0,r3,SPR_EPCR_BASE ;\ l.lwz r3,PT_SR(r1) ;\ l.mtspr r0,r3,SPR_ESR_BASE ;\ - l.lwz r3,PT_FPCSR(r1) ;\ - l.mtspr r0,r3,SPR_FPCSR ;\ l.lwz r2,PT_GPR2(r1) ;\ l.lwz r3,PT_GPR3(r1) ;\ l.lwz r4,PT_GPR4(r1) ;\ @@ -177,8 +175,6 @@ handler: ;\ /* r30 already save */ ;\ l.sw PT_GPR31(r1),r31 ;\ TRACE_IRQS_OFF_ENTRY ;\ - l.mfspr r30,r0,SPR_FPCSR ;\ - l.sw PT_FPCSR(r1),r30 ;\ /* Store -1 in orig_gpr11 for non-syscall exceptions */ ;\ l.addi r30,r0,-1 ;\ l.sw PT_ORIG_GPR11(r1),r30 @@ -219,8 +215,6 @@ handler: ;\ /* Store -1 in orig_gpr11 for non-syscall exceptions */ ;\ l.addi r30,r0,-1 ;\ l.sw PT_ORIG_GPR11(r1),r30 ;\ - l.mfspr r30,r0,SPR_FPCSR ;\ - l.sw PT_FPCSR(r1),r30 ;\ l.addi r3,r1,0 ;\ /* r4 is exception EA */ ;\ l.addi r5,r0,vector ;\ @@ -852,6 +846,7 @@ _syscall_badsys: EXCEPTION_ENTRY(_fpe_trap_handler) CLEAR_LWA_FLAG(r3) + /* r4: EA of fault (set by EXCEPTION_HANDLE) */ l.jal do_fpe_trap l.addi r3,r1,0 /* pt_regs */ @@ -1100,10 +1095,6 @@ ENTRY(_switch) l.sw PT_GPR28(r1),r28 l.sw PT_GPR30(r1),r30 - /* Store the old FPU state to new pt_regs */ - l.mfspr r29,r0,SPR_FPCSR - l.sw 
PT_FPCSR(r1),r29 - l.addi r11,r10,0 /* Save old 'current' to 'last' return value*/ /* We use thread_info->ksp for storing the address of the above @@ -1126,10 +1117,6 @@ ENTRY(_switch) l.lwz r29,PT_SP(r1) l.sw TI_KSP(r10),r29 - /* Restore the old value of FPCSR */ - l.lwz r29,PT_FPCSR(r1) - l.mtspr r0,r29,SPR_FPCSR - /* ...and restore the registers, except r11 because the return value * has already been set above. */ diff --git a/arch/openrisc/kernel/module.c b/arch/openrisc/kernel/module.c index 532013f523ac..c9ff4c4a0b29 100644 --- a/arch/openrisc/kernel/module.c +++ b/arch/openrisc/kernel/module.c @@ -39,22 +39,32 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, value = sym->st_value + rel[i].r_addend; switch (ELF32_R_TYPE(rel[i].r_info)) { - case R_OR32_32: + case R_OR1K_32: *location = value; break; - case R_OR32_CONST: + case R_OR1K_LO_16_IN_INSN: *((uint16_t *)location + 1) = value; break; - case R_OR32_CONSTH: + case R_OR1K_HI_16_IN_INSN: *((uint16_t *)location + 1) = value >> 16; break; - case R_OR32_JUMPTARG: + case R_OR1K_INSN_REL_26: value -= (uint32_t)location; value >>= 2; value &= 0x03ffffff; value |= *location & 0xfc000000; *location = value; break; + case R_OR1K_AHI16: + /* Adjust the operand to match with a signed LO16. */ + value += 0x8000; + *((uint16_t *)location + 1) = value >> 16; + break; + case R_OR1K_SLO16: + /* Split value lower 16-bits. */ + value = ((value & 0xf800) << 10) | (value & 0x7ff); + *location = (*location & ~0x3e007ff) | value; + break; default: pr_err("module %s: Unknown relocation: %u\n", me->name, ELF32_R_TYPE(rel[i].r_info)); diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 86e02929f3ac..eef99fee2110 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -36,6 +36,7 @@ #include <linux/reboot.h> #include <linux/uaccess.h> +#include <asm/fpu.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/spr_defs.h> @@ -65,7 +66,7 @@ void machine_restart(char *cmd) } /* - * This is used if pm_power_off has not been set by a power management + * This is used if a sys-off handler was not set by a power management * driver, in this case we can assume we are on a simulator. On * OpenRISC simulators l.nop 1 will trigger the simulator exit. */ @@ -89,10 +90,8 @@ void machine_halt(void) void machine_power_off(void) { printk(KERN_INFO "*** MACHINE POWER OFF ***\n"); - if (pm_power_off != NULL) - pm_power_off(); - else - default_power_off(); + do_kernel_power_off(); + default_power_off(); } /* @@ -246,6 +245,8 @@ struct task_struct *__switch_to(struct task_struct *old, local_irq_save(flags); + save_fpu(current); + /* current_set is an array of saved current pointers * (one for each cpu). we need them at user->kernel transition, * while we save them at kernel->user transition @@ -258,6 +259,8 @@ struct task_struct *__switch_to(struct task_struct *old, current_thread_info_set[smp_processor_id()] = new_ti; last = (_switch(old_ti, new_ti))->task; + restore_fpu(current); + local_irq_restore(flags); return last; diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c index 1eeac3b62e9d..5091b18eab4c 100644 --- a/arch/openrisc/kernel/ptrace.c +++ b/arch/openrisc/kernel/ptrace.c @@ -88,6 +88,7 @@ static int genregs_set(struct task_struct *target, return ret; } +#ifdef CONFIG_FPU /* * As OpenRISC shares GPRs and floating point registers we don't need to export * the floating point registers again. 
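The R_OR1K_AHI16 handler above uses the usual "adjusted high" trick: biasing by 0x8000 before taking the upper half so that a following signed 16-bit low part reconstructs the full value (R_OR1K_SLO16 then only has to scatter the low half into the store-instruction encoding). A stand-alone check of that identity, with an arbitrary test value:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t value = 0x12348765;	/* arbitrary symbol value + addend */

	/* R_OR1K_AHI16: pre-bias by 0x8000 so a signed low half works. */
	uint16_t hi = (value + 0x8000) >> 16;
	int16_t lo = (int16_t)(value & 0xffff);

	/* A high move of hi plus a signed 16-bit low part restores the value. */
	assert(((uint32_t)hi << 16) + (uint32_t)lo == value);
	return 0;
}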
So here we only export the fpcsr special @@ -97,9 +98,7 @@ static int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - const struct pt_regs *regs = task_pt_regs(target); - - return membuf_store(&to, regs->fpcsr); + return membuf_store(&to, target->thread.fpcsr); } static int fpregs_set(struct task_struct *target, @@ -107,21 +106,20 @@ static int fpregs_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct pt_regs *regs = task_pt_regs(target); - int ret; - /* FPCSR */ - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - ®s->fpcsr, 0, 4); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpcsr, 0, 4); } +#endif /* * Define the register sets available on OpenRISC under Linux */ enum or1k_regset { REGSET_GENERAL, +#ifdef CONFIG_FPU REGSET_FPU, +#endif }; static const struct user_regset or1k_regsets[] = { @@ -133,6 +131,7 @@ static const struct user_regset or1k_regsets[] = { .regset_get = genregs_get, .set = genregs_set, }, +#ifdef CONFIG_FPU [REGSET_FPU] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct __or1k_fpu_state) / sizeof(long), @@ -141,6 +140,7 @@ static const struct user_regset or1k_regsets[] = { .regset_get = fpregs_get, .set = fpregs_set, }, +#endif }; static const struct user_regset_view user_or1k_native_view = { diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index e2f21a5d8ad9..c7ab42e2cb7a 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -23,6 +23,7 @@ #include <linux/stddef.h> #include <linux/resume_user_mode.h> +#include <asm/fpu.h> #include <asm/processor.h> #include <asm/syscall.h> #include <asm/ucontext.h> @@ -39,6 +40,37 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs); asmlinkage int do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall); +#ifdef CONFIG_FPU +static long restore_fp_state(struct sigcontext __user *sc) +{ + long err; + + err = __copy_from_user(¤t->thread.fpcsr, &sc->fpcsr, sizeof(unsigned long)); + if (unlikely(err)) + return err; + + /* Restore the FPU state */ + restore_fpu(current); + + return 0; +} + +static long save_fp_state(struct sigcontext __user *sc) +{ + long err; + + /* Sync the user FPU state so we can copy to sigcontext */ + save_fpu(current); + + err = __copy_to_user(&sc->fpcsr, ¤t->thread.fpcsr, sizeof(unsigned long)); + + return err; +} +#else +#define save_fp_state(sc) (0) +#define restore_fp_state(sc) (0) +#endif + static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { @@ -55,7 +87,7 @@ static int restore_sigcontext(struct pt_regs *regs, err |= __copy_from_user(regs, sc->regs.gpr, 32 * sizeof(unsigned long)); err |= __copy_from_user(®s->pc, &sc->regs.pc, sizeof(unsigned long)); err |= __copy_from_user(®s->sr, &sc->regs.sr, sizeof(unsigned long)); - err |= __copy_from_user(®s->fpcsr, &sc->fpcsr, sizeof(unsigned long)); + err |= restore_fp_state(sc); /* make sure the SM-bit is cleared so user-mode cannot fool us */ regs->sr &= ~SPR_SR_SM; @@ -118,7 +150,7 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) err |= __copy_to_user(sc->regs.gpr, regs, 32 * sizeof(unsigned long)); err |= __copy_to_user(&sc->regs.pc, ®s->pc, sizeof(unsigned long)); err |= __copy_to_user(&sc->regs.sr, ®s->sr, sizeof(unsigned long)); - err |= __copy_to_user(&sc->fpcsr, ®s->fpcsr, sizeof(unsigned long)); + err |= save_fp_state(sc); return err; } diff --git 
a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 9370888c9a7e..c195be9cc9fc 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -31,6 +31,7 @@ #include <linux/uaccess.h> #include <asm/bug.h> +#include <asm/fpu.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/unwinder.h> @@ -51,16 +52,16 @@ static void print_trace(void *data, unsigned long addr, int reliable) { const char *loglvl = data; - printk("%s[<%p>] %s%pS\n", loglvl, (void *) addr, reliable ? "" : "? ", - (void *) addr); + pr_info("%s[<%p>] %s%pS\n", loglvl, (void *) addr, reliable ? "" : "? ", + (void *) addr); } static void print_data(unsigned long base_addr, unsigned long word, int i) { if (i == 0) - printk("(%08lx:)\t%08lx", base_addr + (i * 4), word); + pr_info("(%08lx:)\t%08lx", base_addr + (i * 4), word); else - printk(" %08lx:\t%08lx", base_addr + (i * 4), word); + pr_info(" %08lx:\t%08lx", base_addr + (i * 4), word); } /* displays a short stack trace */ @@ -69,7 +70,7 @@ void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl if (esp == NULL) esp = (unsigned long *)&esp; - printk("%sCall trace:\n", loglvl); + pr_info("%sCall trace:\n", loglvl); unwind_stack((void *)loglvl, esp, print_trace); } @@ -83,57 +84,56 @@ void show_registers(struct pt_regs *regs) if (user_mode(regs)) in_kernel = 0; - printk("CPU #: %d\n" - " PC: %08lx SR: %08lx SP: %08lx FPCSR: %08lx\n", - smp_processor_id(), regs->pc, regs->sr, regs->sp, - regs->fpcsr); - printk("GPR00: %08lx GPR01: %08lx GPR02: %08lx GPR03: %08lx\n", - 0L, regs->gpr[1], regs->gpr[2], regs->gpr[3]); - printk("GPR04: %08lx GPR05: %08lx GPR06: %08lx GPR07: %08lx\n", - regs->gpr[4], regs->gpr[5], regs->gpr[6], regs->gpr[7]); - printk("GPR08: %08lx GPR09: %08lx GPR10: %08lx GPR11: %08lx\n", - regs->gpr[8], regs->gpr[9], regs->gpr[10], regs->gpr[11]); - printk("GPR12: %08lx GPR13: %08lx GPR14: %08lx GPR15: %08lx\n", - regs->gpr[12], regs->gpr[13], regs->gpr[14], regs->gpr[15]); - printk("GPR16: %08lx GPR17: %08lx GPR18: %08lx GPR19: %08lx\n", - regs->gpr[16], regs->gpr[17], regs->gpr[18], regs->gpr[19]); - printk("GPR20: %08lx GPR21: %08lx GPR22: %08lx GPR23: %08lx\n", - regs->gpr[20], regs->gpr[21], regs->gpr[22], regs->gpr[23]); - printk("GPR24: %08lx GPR25: %08lx GPR26: %08lx GPR27: %08lx\n", - regs->gpr[24], regs->gpr[25], regs->gpr[26], regs->gpr[27]); - printk("GPR28: %08lx GPR29: %08lx GPR30: %08lx GPR31: %08lx\n", - regs->gpr[28], regs->gpr[29], regs->gpr[30], regs->gpr[31]); - printk(" RES: %08lx oGPR11: %08lx\n", - regs->gpr[11], regs->orig_gpr11); - - printk("Process %s (pid: %d, stackpage=%08lx)\n", - current->comm, current->pid, (unsigned long)current); + pr_info("CPU #: %d\n" + " PC: %08lx SR: %08lx SP: %08lx\n", + smp_processor_id(), regs->pc, regs->sr, regs->sp); + pr_info("GPR00: %08lx GPR01: %08lx GPR02: %08lx GPR03: %08lx\n", + 0L, regs->gpr[1], regs->gpr[2], regs->gpr[3]); + pr_info("GPR04: %08lx GPR05: %08lx GPR06: %08lx GPR07: %08lx\n", + regs->gpr[4], regs->gpr[5], regs->gpr[6], regs->gpr[7]); + pr_info("GPR08: %08lx GPR09: %08lx GPR10: %08lx GPR11: %08lx\n", + regs->gpr[8], regs->gpr[9], regs->gpr[10], regs->gpr[11]); + pr_info("GPR12: %08lx GPR13: %08lx GPR14: %08lx GPR15: %08lx\n", + regs->gpr[12], regs->gpr[13], regs->gpr[14], regs->gpr[15]); + pr_info("GPR16: %08lx GPR17: %08lx GPR18: %08lx GPR19: %08lx\n", + regs->gpr[16], regs->gpr[17], regs->gpr[18], regs->gpr[19]); + pr_info("GPR20: %08lx GPR21: %08lx GPR22: %08lx GPR23: %08lx\n", + regs->gpr[20], regs->gpr[21], 
regs->gpr[22], regs->gpr[23]); + pr_info("GPR24: %08lx GPR25: %08lx GPR26: %08lx GPR27: %08lx\n", + regs->gpr[24], regs->gpr[25], regs->gpr[26], regs->gpr[27]); + pr_info("GPR28: %08lx GPR29: %08lx GPR30: %08lx GPR31: %08lx\n", + regs->gpr[28], regs->gpr[29], regs->gpr[30], regs->gpr[31]); + pr_info(" RES: %08lx oGPR11: %08lx\n", + regs->gpr[11], regs->orig_gpr11); + + pr_info("Process %s (pid: %d, stackpage=%08lx)\n", + current->comm, current->pid, (unsigned long)current); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. */ if (in_kernel) { - printk("\nStack: "); + pr_info("\nStack: "); show_stack(NULL, (unsigned long *)esp, KERN_EMERG); if (esp < PAGE_OFFSET) goto bad_stack; - printk("\n"); + pr_info("\n"); for (i = -8; i < 24; i += 1) { unsigned long word; if (__get_user(word, &((unsigned long *)esp)[i])) { bad_stack: - printk(" Bad Stack value."); + pr_info(" Bad Stack value."); break; } print_data(esp, word, i); } - printk("\nCode: "); + pr_info("\nCode: "); if (regs->pc < PAGE_OFFSET) goto bad; @@ -142,14 +142,14 @@ bad_stack: if (__get_user(word, &((unsigned long *)regs->pc)[i])) { bad: - printk(" Bad PC value."); + pr_info(" Bad PC value."); break; } print_data(regs->pc, word, i); } } - printk("\n"); + pr_info("\n"); } /* This is normally the 'Oops' routine */ @@ -157,10 +157,10 @@ void __noreturn die(const char *str, struct pt_regs *regs, long err) { console_verbose(); - printk("\n%s#: %04lx\n", str, err & 0xffff); + pr_emerg("\n%s#: %04lx\n", str, err & 0xffff); show_registers(regs); #ifdef CONFIG_JUMP_UPON_UNHANDLED_EXCEPTION - printk("\n\nUNHANDLED_EXCEPTION: entering infinite loop\n"); + pr_emerg("\n\nUNHANDLED_EXCEPTION: entering infinite loop\n"); /* shut down interrupts */ local_irq_disable(); @@ -173,36 +173,51 @@ void __noreturn die(const char *str, struct pt_regs *regs, long err) asmlinkage void unhandled_exception(struct pt_regs *regs, int ea, int vector) { - printk("Unable to handle exception at EA =0x%x, vector 0x%x", - ea, vector); + pr_emerg("Unable to handle exception at EA =0x%x, vector 0x%x", + ea, vector); die("Oops", regs, 9); } asmlinkage void do_fpe_trap(struct pt_regs *regs, unsigned long address) { - int code = FPE_FLTUNK; - unsigned long fpcsr = regs->fpcsr; - - if (fpcsr & SPR_FPCSR_IVF) - code = FPE_FLTINV; - else if (fpcsr & SPR_FPCSR_OVF) - code = FPE_FLTOVF; - else if (fpcsr & SPR_FPCSR_UNF) - code = FPE_FLTUND; - else if (fpcsr & SPR_FPCSR_DZF) - code = FPE_FLTDIV; - else if (fpcsr & SPR_FPCSR_IXF) - code = FPE_FLTRES; - - /* Clear all flags */ - regs->fpcsr &= ~SPR_FPCSR_ALLF; - - force_sig_fault(SIGFPE, code, (void __user *)regs->pc); + if (user_mode(regs)) { + int code = FPE_FLTUNK; +#ifdef CONFIG_FPU + unsigned long fpcsr; + + save_fpu(current); + fpcsr = current->thread.fpcsr; + + if (fpcsr & SPR_FPCSR_IVF) + code = FPE_FLTINV; + else if (fpcsr & SPR_FPCSR_OVF) + code = FPE_FLTOVF; + else if (fpcsr & SPR_FPCSR_UNF) + code = FPE_FLTUND; + else if (fpcsr & SPR_FPCSR_DZF) + code = FPE_FLTDIV; + else if (fpcsr & SPR_FPCSR_IXF) + code = FPE_FLTRES; + + /* Clear all flags */ + current->thread.fpcsr &= ~SPR_FPCSR_ALLF; + restore_fpu(current); +#endif + force_sig_fault(SIGFPE, code, (void __user *)regs->pc); + } else { + pr_emerg("KERNEL: Illegal fpe exception 0x%.8lx\n", regs->pc); + die("Die:", regs, SIGFPE); + } } asmlinkage void do_trap(struct pt_regs *regs, unsigned long address) { - force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc); + if (user_mode(regs)) { + force_sig_fault(SIGTRAP, TRAP_BRKPT, 
(void __user *)regs->pc); + } else { + pr_emerg("KERNEL: Illegal trap exception 0x%.8lx\n", regs->pc); + die("Die:", regs, SIGILL); + } } asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address) @@ -211,8 +226,7 @@ asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address) /* Send a SIGBUS */ force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)address); } else { - printk("KERNEL: Unaligned Access 0x%.8lx\n", address); - show_registers(regs); + pr_emerg("KERNEL: Unaligned Access 0x%.8lx\n", address); die("Die:", regs, address); } @@ -224,8 +238,7 @@ asmlinkage void do_bus_fault(struct pt_regs *regs, unsigned long address) /* Send a SIGBUS */ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } else { /* Kernel mode */ - printk("KERNEL: Bus error (SIGBUS) 0x%.8lx\n", address); - show_registers(regs); + pr_emerg("KERNEL: Bus error (SIGBUS) 0x%.8lx\n", address); die("Die:", regs, address); } } @@ -419,9 +432,8 @@ asmlinkage void do_illegal_instruction(struct pt_regs *regs, /* Send a SIGILL */ force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)address); } else { /* Kernel mode */ - printk("KERNEL: Illegal instruction (SIGILL) 0x%.8lx\n", - address); - show_registers(regs); + pr_emerg("KERNEL: Illegal instruction (SIGILL) 0x%.8lx\n", + address); die("Die:", regs, address); } } diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index ee4febb30386..5ce258f3fffa 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -131,7 +131,7 @@ CONFIG_PPDEV=m CONFIG_I2C=y CONFIG_HWMON=m CONFIG_DRM=m -CONFIG_DRM_DP_CEC=y +CONFIG_DRM_DISPLAY_DP_AUX_CEC=y # CONFIG_DRM_I2C_CH7006 is not set # CONFIG_DRM_I2C_SIL164 is not set CONFIG_DRM_RADEON=m diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 444154271f23..800eb64e91ad 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -344,7 +344,7 @@ static int smp_boot_one_cpu(int cpuid, struct task_struct *idle) struct irq_desc *desc = irq_to_desc(i); if (desc && desc->kstat_irqs) - *per_cpu_ptr(desc->kstat_irqs, cpuid) = 0; + *per_cpu_ptr(desc->kstat_irqs, cpuid) = (struct irqstat) { }; } #endif diff --git a/arch/parisc/net/bpf_jit_core.c b/arch/parisc/net/bpf_jit_core.c index d6ee2fd45550..979f45d4d1fb 100644 --- a/arch/parisc/net/bpf_jit_core.c +++ b/arch/parisc/net/bpf_jit_core.c @@ -167,7 +167,13 @@ skip_init_ctx: bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns); if (!prog->is_func || extra_pass) { - bpf_jit_binary_lock_ro(jit_data->header); + if (bpf_jit_binary_lock_ro(jit_data->header)) { + bpf_jit_binary_free(jit_data->header); + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + goto out_offset; + } prologue_len = ctx->epilogue_offset - ctx->body_len; for (i = 0; i < prog->len; i++) ctx->offset[i] += prologue_len; diff --git a/arch/powerpc/configs/adder875_defconfig b/arch/powerpc/configs/adder875_defconfig index 7f35d5bc1229..97f4d4851735 100644 --- a/arch/powerpc/configs/adder875_defconfig +++ b/arch/powerpc/configs/adder875_defconfig @@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/powerpc/configs/ep88xc_defconfig b/arch/powerpc/configs/ep88xc_defconfig index a98ef6a4abef..50cc59eb36cf 100644 --- 
a/arch/powerpc/configs/ep88xc_defconfig +++ b/arch/powerpc/configs/ep88xc_defconfig @@ -6,7 +6,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/powerpc/configs/mpc866_ads_defconfig b/arch/powerpc/configs/mpc866_ads_defconfig index 5c56d36cdfc5..6f449411abf7 100644 --- a/arch/powerpc/configs/mpc866_ads_defconfig +++ b/arch/powerpc/configs/mpc866_ads_defconfig @@ -6,7 +6,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_BUG is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_EPOLL is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index 56b876e418e9..77306be62e9e 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -7,7 +7,7 @@ CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/powerpc/configs/tqm8xx_defconfig b/arch/powerpc/configs/tqm8xx_defconfig index 083c2e57520a..383c0966e92f 100644 --- a/arch/powerpc/configs/tqm8xx_defconfig +++ b/arch/powerpc/configs/tqm8xx_defconfig @@ -6,7 +6,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_MODULES=y diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index 78302f6c2580..c6390890a60c 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -13,6 +13,17 @@ #define VDSO_HAS_TIME 1 +/* + * powerpc specific delta calculation. + * + * This variant removes the masking of the subtraction because the + * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX + * which would result in a pointless operation. The compiler cannot + * optimize it away as the mask comes from the vdso data and is not compile + * time constant. + */ +#define VDSO_DELTA_NOMASK 1 + static __always_inline int do_syscall_2(const unsigned long _r0, const unsigned long _r3, const unsigned long _r4) { @@ -104,21 +115,6 @@ static inline bool vdso_clocksource_ok(const struct vdso_data *vd) } #define vdso_clocksource_ok vdso_clocksource_ok -/* - * powerpc specific delta calculation. - * - * This variant removes the masking of the subtraction because the - * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX - * which would result in a pointless operation. The compiler cannot - * optimize it away as the mask comes from the vdso data and is not compile - * time constant. 
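
As an illustration (not part of the patch): the VDSO_DELTA_NOMASK define added above replaces the per-arch vdso_calc_delta() override removed just below (and the same switch is made for s390 later in this series). A minimal user-space sketch of the two delta computations, with stand-in values since the real mask/mult come from the vdso data:

#include <stdint.h>
#include <stdio.h>

/* Generic vDSO path: mask the cycle delta before scaling. */
static uint64_t calc_delta_masked(uint64_t cycles, uint64_t last,
				  uint64_t mask, uint32_t mult)
{
	return ((cycles - last) & mask) * mult;
}

/*
 * VDSO_DELTA_NOMASK path: on powerpc/s390 the clocksource mask of all
 * VDSO-capable clocksources is U64_MAX, so the AND is a no-op and the
 * generic code can simply drop it.
 */
static uint64_t calc_delta_nomask(uint64_t cycles, uint64_t last, uint32_t mult)
{
	return (cycles - last) * mult;
}

int main(void)
{
	uint64_t cycles = 1000, last = 400;

	printf("%llu %llu\n",
	       (unsigned long long)calc_delta_masked(cycles, last, UINT64_MAX, 3),
	       (unsigned long long)calc_delta_nomask(cycles, last, 3));
	return 0;
}
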
- */ -static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) -{ - return (cycles - last) * mult; -} -#define vdso_calc_delta vdso_calc_delta - #ifndef __powerpc64__ static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift) { diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index e42984878503..f2636414d82a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -837,7 +837,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu *addr) */ static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc) { - this_cpu_inc_rm(desc->kstat_irqs); + this_cpu_inc_rm(&desc->kstat_irqs->cnt); __this_cpu_inc(kstat.irqs_sum); } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index be09c8836d56..9e87287942dc 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -173,6 +173,8 @@ config RISCV select PCI_DOMAINS_GENERIC if PCI select PCI_MSI if PCI select RISCV_ALTERNATIVE if !XIP_KERNEL + select RISCV_APLIC + select RISCV_IMSIC select RISCV_INTC select RISCV_TIMER if RISCV_SBI select SIFIVE_PLIC diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index 2552e78074a3..af9601da4643 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_SYSFS_SYSCALL is not set # CONFIG_FHANDLE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig index 8f67fb830585..dd460c649152 100644 --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -3,7 +3,7 @@ CONFIG_LOG_BUF_SHIFT=13 CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_SYSFS_SYSCALL is not set # CONFIG_FHANDLE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index de8143d1f738..d4b03dc3c2c0 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -10,7 +10,7 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set # CONFIG_FHANDLE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set # CONFIG_TIMERFD is not set diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index f4b6b3b9edda..fdbf88ca8b70 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -81,6 +81,8 @@ struct rv_jit_context { int nexentries; unsigned long flags; int stack_size; + u64 arena_vm_start; + u64 user_vm_start; }; /* Convert from ninsns to bytes. */ @@ -606,7 +608,7 @@ static inline u32 rv_nop(void) return rv_i_insn(0, 0, 0, 0, 0x13); } -/* RVC instrutions. */ +/* RVC instructions. */ static inline u16 rvc_addi4spn(u8 rd, u32 imm10) { @@ -735,7 +737,7 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2) return rv_css_insn(0x6, imm, rs2, 0x2); } -/* RVZBB instrutions. */ +/* RVZBB instructions. 
*/ static inline u32 rvzbb_sextb(u8 rd, u8 rs1) { return rv_i_insn(0x604, rs1, 1, rd, 0x13); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index ec9d692838fc..79a001d5533e 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -12,12 +12,14 @@ #include <linux/stop_machine.h> #include <asm/patch.h> #include <asm/cfi.h> +#include <asm/percpu.h> #include "bpf_jit.h" #define RV_FENTRY_NINSNS 2 #define RV_REG_TCC RV_REG_A6 #define RV_REG_TCC_SAVED RV_REG_S6 /* Store A6 in S6 if program do calls */ +#define RV_REG_ARENA RV_REG_S7 /* For storing arena_vm_start */ static const int regmap[] = { [BPF_REG_0] = RV_REG_A5, @@ -255,6 +257,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx); store_offset -= 8; } + if (ctx->arena_vm_start) { + emit_ld(RV_REG_ARENA, store_offset, RV_REG_SP, ctx); + store_offset -= 8; + } emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx); /* Set return value. */ @@ -498,33 +504,33 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, break; /* src_reg = atomic_fetch_<op>(dst_reg + off16, src_reg) */ case BPF_ADD | BPF_FETCH: - emit(is64 ? rv_amoadd_d(rs, rs, rd, 0, 0) : - rv_amoadd_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoadd_d(rs, rs, rd, 1, 1) : + rv_amoadd_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_AND | BPF_FETCH: - emit(is64 ? rv_amoand_d(rs, rs, rd, 0, 0) : - rv_amoand_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoand_d(rs, rs, rd, 1, 1) : + rv_amoand_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_OR | BPF_FETCH: - emit(is64 ? rv_amoor_d(rs, rs, rd, 0, 0) : - rv_amoor_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoor_d(rs, rs, rd, 1, 1) : + rv_amoor_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_XOR | BPF_FETCH: - emit(is64 ? rv_amoxor_d(rs, rs, rd, 0, 0) : - rv_amoxor_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoxor_d(rs, rs, rd, 1, 1) : + rv_amoxor_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; /* src_reg = atomic_xchg(dst_reg + off16, src_reg); */ case BPF_XCHG: - emit(is64 ? rv_amoswap_d(rs, rs, rd, 0, 0) : - rv_amoswap_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? 
rv_amoswap_d(rs, rs, rd, 1, 1) : + rv_amoswap_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; @@ -548,6 +554,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) +#define REG_DONT_CLEAR_MARKER 0 /* RV_REG_ZERO unused in pt_regmap */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -555,7 +562,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); - *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0; + if (regs_offset != REG_DONT_CLEAR_MARKER) + *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0; regs->epc = (unsigned long)&ex->fixup - offset; return true; @@ -572,7 +580,8 @@ static int add_exception_handler(const struct bpf_insn *insn, off_t fixup_offset; if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable || - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX)) + (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32)) return 0; if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) @@ -1073,6 +1082,33 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + emit_mv(RV_REG_T1, rs, ctx); + emit_zextw(RV_REG_T1, RV_REG_T1, ctx); + emit_imm(rd, (ctx->user_vm_start >> 32) << 32, ctx); + emit(rv_beq(RV_REG_T1, RV_REG_ZERO, 4), ctx); + emit_or(RV_REG_T1, rd, RV_REG_T1, ctx); + emit_mv(rd, RV_REG_T1, ctx); + break; + } else if (insn_is_mov_percpu_addr(insn)) { + if (rd != rs) + emit_mv(rd, rs, ctx); +#ifdef CONFIG_SMP + /* Load current CPU number in T1 */ + emit_ld(RV_REG_T1, offsetof(struct thread_info, cpu), + RV_REG_TP, ctx); + /* << 3 because offsets are 8 bytes */ + emit_slli(RV_REG_T1, RV_REG_T1, 3, ctx); + /* Load address of __per_cpu_offset array in T2 */ + emit_addr(RV_REG_T2, (u64)&__per_cpu_offset, extra_pass, ctx); + /* Add offset of current CPU to __per_cpu_offset */ + emit_add(RV_REG_T1, RV_REG_T2, RV_REG_T1, ctx); + /* Load __per_cpu_offset[cpu] in T1 */ + emit_ld(RV_REG_T1, 0, RV_REG_T1, ctx); + /* Add the offset to Rd */ + emit_add(rd, rd, RV_REG_T1, ctx); +#endif + } if (imm == 1) { /* Special mov32 for zext */ emit_zextw(rd, rd, ctx); @@ -1457,6 +1493,22 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, bool fixed_addr; u64 addr; + /* Inline calls to bpf_get_smp_processor_id() + * + * RV_REG_TP holds the address of the current CPU's task_struct and thread_info is + * at offset 0 in task_struct. 
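
As an illustration (not part of the patch): the mov_percpu_addr sequence above, and the bpf_get_smp_processor_id() inlining whose comment continues below, come down to the same arithmetic — read the CPU id from thread_info (reachable through the tp register) and, for per-cpu addresses, add __per_cpu_offset[cpu]. A user-space model with stand-in tables; the real thread_info layout and offset table are kernel internals:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

/* Stand-in for the kernel's __per_cpu_offset[] table (8-byte entries). */
static uintptr_t per_cpu_offset[NR_CPUS] = { 0x1000, 0x2000, 0x3000, 0x4000 };

/* Stand-in for ((struct thread_info *)tp)->cpu, i.e. the emit_ld() from tp. */
static unsigned int current_cpu(void)
{
	return 1;
}

/*
 * rd += __per_cpu_offset[cpu]: what the emitted ld/slli/add sequence does.
 * The "<< 3" in the JIT is the byte offset into this 8-byte-entry table,
 * which plain array indexing expresses here.
 */
static uintptr_t percpu_address(uintptr_t rd)
{
	unsigned int cpu = current_cpu();
	uintptr_t off = per_cpu_offset[cpu];

	return rd + off;
}

int main(void)
{
	printf("%#lx\n", (unsigned long)percpu_address(0x8000));
	return 0;
}
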
+ * Load cpu from thread_info: + * Set R0 to ((struct thread_info *)(RV_REG_TP))->cpu + * + * This replicates the implementation of raw_smp_processor_id() on RISCV + */ + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { + /* Load current CPU number in R0 */ + emit_ld(bpf_to_rv_reg(BPF_REG_0, ctx), offsetof(struct thread_info, cpu), + RV_REG_TP, ctx); + break; + } + mark_call(ctx); ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &addr, &fixed_addr); @@ -1539,6 +1591,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: + /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + RV_REG_ARENA + off) */ + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { int insn_len, insns_start; bool sign_ext; @@ -1546,6 +1603,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T2, rs, RV_REG_ARENA, ctx); + rs = RV_REG_T2; + } + switch (BPF_SIZE(code)) { case BPF_B: if (is_12b_int(off)) { @@ -1682,6 +1744,86 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); break; + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + { + int insn_len, insns_start; + + emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T3; + + /* Load imm to a register then store it */ + emit_imm(RV_REG_T1, imm, ctx); + + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sb(rd, off, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_H: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sh(rd, off, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_W: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sw(rd, off, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_DW: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sd(rd, off, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, + insn_len); + if (ret) + return ret; + + break; + } + /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: if (is_12b_int(off)) { @@ -1728,6 
+1870,84 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_atomic(rd, rs, off, imm, BPF_SIZE(code) == BPF_DW, ctx); break; + + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + { + int insn_len, insns_start; + + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; + + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sb(rd, off, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sb(RV_REG_T1, 0, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_H: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sh(rd, off, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sh(RV_REG_T1, 0, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_W: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sw(rd, off, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit_sw(RV_REG_T1, 0, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_DW: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sd(rd, off, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit_sd(RV_REG_T1, 0, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, + insn_len); + if (ret) + return ret; + + break; + } + default: pr_err("bpf-jit: unknown opcode %02x\n", code); return -EINVAL; @@ -1759,6 +1979,8 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) stack_adjust += 8; if (seen_reg(RV_REG_S6, ctx)) stack_adjust += 8; + if (ctx->arena_vm_start) + stack_adjust += 8; stack_adjust = round_up(stack_adjust, 16); stack_adjust += bpf_stack_adjust; @@ -1810,6 +2032,10 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) emit_sd(RV_REG_SP, store_offset, RV_REG_S6, ctx); store_offset -= 8; } + if (ctx->arena_vm_start) { + emit_sd(RV_REG_SP, store_offset, RV_REG_ARENA, ctx); + store_offset -= 8; + } emit_addi(RV_REG_FP, RV_REG_SP, stack_adjust, ctx); @@ -1823,6 +2049,9 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) emit_mv(RV_REG_TCC_SAVED, RV_REG_TCC, ctx); ctx->stack_size = stack_adjust; + + if (ctx->arena_vm_start) + emit_imm(RV_REG_ARENA, ctx->arena_vm_start, ctx); } void bpf_jit_build_epilogue(struct rv_jit_context *ctx) @@ -1839,3 +2068,23 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +bool bpf_jit_supports_arena(void) +{ + return true; +} + +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} + +bool bpf_jit_inlines_helper_call(s32 imm) +{ + switch (imm) { + case BPF_FUNC_get_smp_processor_id: + return true; + default: + return false; + } +} diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index e238fdbd5dbc..0a96abdaca65 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -80,6 +80,8 @@ struct bpf_prog 
*bpf_int_jit_compile(struct bpf_prog *prog) goto skip_init_ctx; } + ctx->arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); + ctx->user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); ctx->prog = prog; ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL); if (!ctx->offset) { diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index db84942eb78f..7937765ccfa5 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -6,16 +6,13 @@ #define VDSO_HAS_CLOCK_GETRES 1 +#define VDSO_DELTA_NOMASK 1 + #include <asm/syscall.h> #include <asm/timex.h> #include <asm/unistd.h> #include <linux/compiler.h> -#define vdso_calc_delta __arch_vdso_calc_delta -static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) -{ - return (cycles - last) * mult; -} static __always_inline const struct vdso_data *__arch_get_vdso_data(void) { diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 5af0402e94b8..4be8f5cadd02 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1427,8 +1427,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \ (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \ src_reg, dst_reg, off); \ - if (is32 && (insn->imm & BPF_FETCH)) \ - EMIT_ZERO(src_reg); \ + if (insn->imm & BPF_FETCH) { \ + /* bcr 14,0 - see atomic_fetch_{add,and,or,xor}() */ \ + _EMIT2(0x07e0); \ + if (is32) \ + EMIT_ZERO(src_reg); \ + } \ } while (0) case BPF_ADD: case BPF_ADD | BPF_FETCH: @@ -2108,7 +2112,11 @@ skip_init_ctx: print_fn_code(jit.prg_buf, jit.size_prg); } if (!fp->is_func || extra_pass) { - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) { + bpf_jit_binary_free(header); + fp = orig_fp; + goto free_addrs; + } } else { jit_data->header = header; jit_data->ctx = jit; diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2ad3e29f0ebe..6bc60f964f96 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -125,7 +125,8 @@ config ARCH_HAS_ILOG2_U64 config NO_IOPORT_MAP def_bool !PCI - depends on !SH_SHMIN && !SH_HP6XX && !SH_SOLUTION_ENGINE + depends on !SH_SHMIN && !SH_HP6XX && !SH_SOLUTION_ENGINE && \ + !SH_DREAMCAST config IO_TRAPPED bool diff --git a/arch/sh/boards/board-sh7757lcr.c b/arch/sh/boards/board-sh7757lcr.c index f39c8196efdf..689ea14a6678 100644 --- a/arch/sh/boards/board-sh7757lcr.c +++ b/arch/sh/boards/board-sh7757lcr.c @@ -569,7 +569,7 @@ static int __init sh7757lcr_devices_setup(void) arch_initcall(sh7757lcr_devices_setup); /* Initialize IRQ setting */ -void __init init_sh7757lcr_IRQ(void) +static void __init init_sh7757lcr_IRQ(void) { plat_irq_setup_pins(IRQ_MODE_IRQ7654); plat_irq_setup_pins(IRQ_MODE_IRQ3210); diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c index 77dad1e511b4..25c4968f0d8b 100644 --- a/arch/sh/boards/board-sh7785lcr.c +++ b/arch/sh/boards/board-sh7785lcr.c @@ -295,7 +295,7 @@ static int __init sh7785lcr_devices_setup(void) device_initcall(sh7785lcr_devices_setup); /* Initialize IRQ setting */ -void __init init_sh7785lcr_IRQ(void) +static void __init init_sh7785lcr_IRQ(void) { plat_irq_setup_pins(IRQ_MODE_IRQ7654); plat_irq_setup_pins(IRQ_MODE_IRQ3210); diff --git a/arch/sh/boards/mach-dreamcast/setup.c b/arch/sh/boards/mach-dreamcast/setup.c index 2d966c1c2cc1..daa8455549fa 100644 --- a/arch/sh/boards/mach-dreamcast/setup.c +++ 
b/arch/sh/boards/mach-dreamcast/setup.c @@ -25,10 +25,13 @@ #include <asm/irq.h> #include <asm/rtc.h> #include <asm/machvec.h> +#include <cpu/addrspace.h> #include <mach/sysasic.h> static void __init dreamcast_setup(char **cmdline_p) { + /* GAPS PCI bridge assumes P2 area relative addresses. */ + __set_io_port_base(P2SEG); } static struct sh_machine_vector mv_dreamcast __initmv = { diff --git a/arch/sh/boards/mach-highlander/pinmux-r7785rp.c b/arch/sh/boards/mach-highlander/pinmux-r7785rp.c index 703179faf652..689bd8732d9e 100644 --- a/arch/sh/boards/mach-highlander/pinmux-r7785rp.c +++ b/arch/sh/boards/mach-highlander/pinmux-r7785rp.c @@ -5,6 +5,7 @@ #include <linux/init.h> #include <linux/gpio.h> #include <cpu/sh7785.h> +#include <mach/highlander.h> void __init highlander_plat_pinmux_setup(void) { diff --git a/arch/sh/boards/mach-sh03/rtc.c b/arch/sh/boards/mach-sh03/rtc.c index 7fb474844a2d..bc6cf995128c 100644 --- a/arch/sh/boards/mach-sh03/rtc.c +++ b/arch/sh/boards/mach-sh03/rtc.c @@ -120,7 +120,7 @@ static int set_rtc_mmss(struct rtc_time *tm) return retval; } -int sh03_rtc_settimeofday(struct device *dev, struct rtc_time *tm) +static int sh03_rtc_settimeofday(struct device *dev, struct rtc_time *tm) { return set_rtc_mmss(tm); } diff --git a/arch/sh/boards/of-generic.c b/arch/sh/boards/of-generic.c index f7f3e618e85b..cc88cb8908cc 100644 --- a/arch/sh/boards/of-generic.c +++ b/arch/sh/boards/of-generic.c @@ -10,6 +10,8 @@ #include <linux/of_fdt.h> #include <linux/clocksource.h> #include <linux/irqchip.h> + +#include <asm/clock.h> #include <asm/machvec.h> #include <asm/rtc.h> diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index 6c6c791a1d06..54efed53c891 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -5,7 +5,7 @@ # create a compressed vmlinux image from the original vmlinux # -OBJECTS := head_32.o misc.o cache.o piggy.o \ +OBJECTS := head_32.o misc.o piggy.o \ ashiftrt.o ashldi3.o ashrsi3.o ashlsi3.o lshrsi3.o targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 \ diff --git a/arch/sh/boot/compressed/cache.c b/arch/sh/boot/compressed/cache.c deleted file mode 100644 index 31e04ff4841e..000000000000 --- a/arch/sh/boot/compressed/cache.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -int cache_control(unsigned int command) -{ - volatile unsigned int *p = (volatile unsigned int *) 0x80000000; - int i; - - for (i = 0; i < (32 * 1024); i += 32) { - (void)*p; - p += (32 / sizeof(int)); - } - - return 0; -} diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index ca05c99a3d5b..3690379cc86b 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -16,6 +16,8 @@ #include <asm/addrspace.h> #include <asm/page.h> +#include "misc.h" + /* * gzip declarations */ @@ -26,11 +28,6 @@ #undef memcpy #define memzero(s, n) memset ((s), 0, (n)) -/* cache.c */ -#define CACHE_ENABLE 0 -#define CACHE_DISABLE 1 -int cache_control(unsigned int command); - extern char input_data[]; extern int input_len; static unsigned char *output; @@ -139,8 +136,6 @@ void decompress_kernel(void) free_mem_end_ptr = free_mem_ptr + HEAP_SIZE; puts("Uncompressing Linux... 
"); - cache_control(CACHE_ENABLE); __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error); - cache_control(CACHE_DISABLE); puts("Ok, booting the kernel.\n"); } diff --git a/arch/sh/boot/compressed/misc.h b/arch/sh/boot/compressed/misc.h new file mode 100644 index 000000000000..2b4534faa305 --- /dev/null +++ b/arch/sh/boot/compressed/misc.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MISC_H +#define MISC_H + +void arch_ftrace_ops_list_func(void); +void decompress_kernel(void); +void ftrace_stub(void); + +#endif /* MISC_H */ diff --git a/arch/sh/boot/dts/j2_mimas_v2.dts b/arch/sh/boot/dts/j2_mimas_v2.dts index fa9562f78d53..faf884f53804 100644 --- a/arch/sh/boot/dts/j2_mimas_v2.dts +++ b/arch/sh/boot/dts/j2_mimas_v2.dts @@ -71,8 +71,6 @@ #address-cells = <1>; #size-cells = <0>; - spi-max-frequency = <25000000>; - reg = <0x40 0x8>; sdcard@0 { diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig index 9ee35269bee2..ab3bf72264df 100644 --- a/arch/sh/configs/edosk7705_defconfig +++ b/arch/sh/configs/edosk7705_defconfig @@ -6,7 +6,7 @@ # CONFIG_PRINTK is not set # CONFIG_BUG is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig index 14d0f5ead502..4765966fec99 100644 --- a/arch/sh/configs/se7619_defconfig +++ b/arch/sh/configs/se7619_defconfig @@ -4,7 +4,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index dc854293da43..20f07aee5bde 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -7,7 +7,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_KALLSYMS_ALL=y # CONFIG_BUG is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SHMEM is not set CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index c891945b8a90..00862d3c030d 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -7,7 +7,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_KALLSYMS_ALL=y # CONFIG_BUG is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SHMEM is not set CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig index e078b193a78a..bfeb004f130e 100644 --- a/arch/sh/configs/shmin_defconfig +++ b/arch/sh/configs/shmin_defconfig @@ -5,7 +5,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_HOTPLUG is not set # CONFIG_BUG is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set diff --git a/arch/sh/drivers/dma/dma-api.c b/arch/sh/drivers/dma/dma-api.c index 89cd4a3b4cca..87e5a8928873 100644 --- a/arch/sh/drivers/dma/dma-api.c +++ b/arch/sh/drivers/dma/dma-api.c @@ -41,21 +41,6 @@ struct dma_info *get_dma_info(unsigned int chan) } EXPORT_SYMBOL(get_dma_info); -struct dma_info *get_dma_info_by_name(const char *dmac_name) -{ - struct dma_info *info; - - list_for_each_entry(info, 
®istered_dmac_list, list) { - if (dmac_name && (strcmp(dmac_name, info->name) != 0)) - continue; - else - return info; - } - - return NULL; -} -EXPORT_SYMBOL(get_dma_info_by_name); - static unsigned int get_nr_channels(void) { struct dma_info *info; @@ -101,93 +86,6 @@ int get_dma_residue(unsigned int chan) } EXPORT_SYMBOL(get_dma_residue); -static int search_cap(const char **haystack, const char *needle) -{ - const char **p; - - for (p = haystack; *p; p++) - if (strcmp(*p, needle) == 0) - return 1; - - return 0; -} - -/** - * request_dma_bycap - Allocate a DMA channel based on its capabilities - * @dmac: List of DMA controllers to search - * @caps: List of capabilities - * - * Search all channels of all DMA controllers to find a channel which - * matches the requested capabilities. The result is the channel - * number if a match is found, or %-ENODEV if no match is found. - * - * Note that not all DMA controllers export capabilities, in which - * case they can never be allocated using this API, and so - * request_dma() must be used specifying the channel number. - */ -int request_dma_bycap(const char **dmac, const char **caps, const char *dev_id) -{ - unsigned int found = 0; - struct dma_info *info; - const char **p; - int i; - - BUG_ON(!dmac || !caps); - - list_for_each_entry(info, ®istered_dmac_list, list) - if (strcmp(*dmac, info->name) == 0) { - found = 1; - break; - } - - if (!found) - return -ENODEV; - - for (i = 0; i < info->nr_channels; i++) { - struct dma_channel *channel = &info->channels[i]; - - if (unlikely(!channel->caps)) - continue; - - for (p = caps; *p; p++) { - if (!search_cap(channel->caps, *p)) - break; - if (request_dma(channel->chan, dev_id) == 0) - return channel->chan; - } - } - - return -EINVAL; -} -EXPORT_SYMBOL(request_dma_bycap); - -int dmac_search_free_channel(const char *dev_id) -{ - struct dma_channel *channel = { 0 }; - struct dma_info *info = get_dma_info(0); - int i; - - for (i = 0; i < info->nr_channels; i++) { - channel = &info->channels[i]; - if (unlikely(!channel)) - return -ENODEV; - - if (atomic_read(&channel->busy) == 0) - break; - } - - if (info->ops->request) { - int result = info->ops->request(channel); - if (result) - return result; - - atomic_set(&channel->busy, 1); - return channel->chan; - } - - return -ENOSYS; -} - int request_dma(unsigned int chan, const char *dev_id) { struct dma_channel *channel = { 0 }; @@ -240,35 +138,6 @@ void dma_wait_for_completion(unsigned int chan) } EXPORT_SYMBOL(dma_wait_for_completion); -int register_chan_caps(const char *dmac, struct dma_chan_caps *caps) -{ - struct dma_info *info; - unsigned int found = 0; - int i; - - list_for_each_entry(info, ®istered_dmac_list, list) - if (strcmp(dmac, info->name) == 0) { - found = 1; - break; - } - - if (unlikely(!found)) - return -ENODEV; - - for (i = 0; i < info->nr_channels; i++, caps++) { - struct dma_channel *channel; - - if ((info->first_channel_nr + i) != caps->ch_num) - return -EINVAL; - - channel = &info->channels[i]; - channel->caps = caps->caplist; - } - - return 0; -} -EXPORT_SYMBOL(register_chan_caps); - void dma_configure_channel(unsigned int chan, unsigned long flags) { struct dma_info *info = get_dma_info(chan); @@ -294,18 +163,6 @@ int dma_xfer(unsigned int chan, unsigned long from, } EXPORT_SYMBOL(dma_xfer); -int dma_extend(unsigned int chan, unsigned long op, void *param) -{ - struct dma_info *info = get_dma_info(chan); - struct dma_channel *channel = get_dma_channel(chan); - - if (info->ops->extend) - return info->ops->extend(channel, op, param); - - 
return -ENOSYS; -} -EXPORT_SYMBOL(dma_extend); - static int dma_proc_show(struct seq_file *m, void *v) { struct dma_info *info = v; diff --git a/arch/sh/drivers/push-switch.c b/arch/sh/drivers/push-switch.c index 6ecba5f521eb..362e4860bf52 100644 --- a/arch/sh/drivers/push-switch.c +++ b/arch/sh/drivers/push-switch.c @@ -91,7 +91,7 @@ err: return ret; } -static int switch_drv_remove(struct platform_device *pdev) +static void switch_drv_remove(struct platform_device *pdev) { struct push_switch *psw = platform_get_drvdata(pdev); struct push_switch_platform_info *psw_info = pdev->dev.platform_data; @@ -106,13 +106,11 @@ static int switch_drv_remove(struct platform_device *pdev) free_irq(irq, pdev); kfree(psw); - - return 0; } static struct platform_driver switch_driver = { .probe = switch_drv_probe, - .remove = switch_drv_remove, + .remove_new = switch_drv_remove, .driver = { .name = DRV_NAME, }, diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 51112f54552b..e6642ff14889 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -104,6 +104,18 @@ void kunmap_coherent(void *kvaddr); void cpu_cache_init(void); +void __weak l2_cache_init(void); + +void __weak j2_cache_init(void); +void __weak sh2_cache_init(void); +void __weak sh2a_cache_init(void); +void __weak sh3_cache_init(void); +void __weak shx3_cache_init(void); +void __weak sh4_cache_init(void); +void __weak sh7705_cache_init(void); + +void __weak sh4__flush_region_init(void); + static inline void *sh_cacheop_vaddr(void *vaddr) { if (__in_29bit_mode()) diff --git a/arch/sh/include/asm/dma.h b/arch/sh/include/asm/dma.h index c8bee3f985a2..6b6d409956d1 100644 --- a/arch/sh/include/asm/dma.h +++ b/arch/sh/include/asm/dma.h @@ -56,7 +56,6 @@ struct dma_ops { int (*get_residue)(struct dma_channel *chan); int (*xfer)(struct dma_channel *chan); int (*configure)(struct dma_channel *chan, unsigned long flags); - int (*extend)(struct dma_channel *chan, unsigned long op, void *param); }; struct dma_channel { @@ -118,8 +117,6 @@ extern int dma_xfer(unsigned int chan, unsigned long from, #define dma_read_page(chan, from, to) \ dma_read(chan, from, to, PAGE_SIZE) -extern int request_dma_bycap(const char **dmac, const char **caps, - const char *dev_id); extern int get_dma_residue(unsigned int chan); extern struct dma_info *get_dma_info(unsigned int chan); extern struct dma_channel *get_dma_channel(unsigned int chan); @@ -128,10 +125,6 @@ extern void dma_configure_channel(unsigned int chan, unsigned long flags); extern int register_dmac(struct dma_info *info); extern void unregister_dmac(struct dma_info *info); -extern struct dma_info *get_dma_info_by_name(const char *dmac_name); - -extern int dma_extend(unsigned int chan, unsigned long op, void *param); -extern int register_chan_caps(const char *dmac, struct dma_chan_caps *capslist); /* arch/sh/drivers/dma/dma-sysfs.c */ extern int dma_create_sysfs_files(struct dma_channel *, struct dma_info *); diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h index 04584be8986c..0379f4cce5ed 100644 --- a/arch/sh/include/asm/fpu.h +++ b/arch/sh/include/asm/fpu.h @@ -64,6 +64,9 @@ static inline void clear_fpu(struct task_struct *tsk, struct pt_regs *regs) preempt_enable(); } +void float_raise(unsigned int flags); +int float_rounding_mode(void); + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_FPU_H */ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index b1c1dc0cc261..1c10e1066390 100644 --- 
a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -33,6 +33,8 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr); + #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ @@ -43,6 +45,14 @@ extern void *return_address(unsigned int); #define ftrace_return_address(n) return_address(n) +#ifdef CONFIG_DYNAMIC_FTRACE +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_FTRACE_H */ diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 361a0f57bdeb..74a438cea655 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -52,6 +52,8 @@ struct pmu; /* arch/sh/kernel/hw_breakpoint.c */ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); +extern int arch_bp_generic_fields(int sh_len, int sh_type, int *gen_len, + int *gen_type); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); diff --git a/arch/sh/include/asm/setup.h b/arch/sh/include/asm/setup.h index fc807011187f..84bb23a771f3 100644 --- a/arch/sh/include/asm/setup.h +++ b/arch/sh/include/asm/setup.h @@ -21,5 +21,6 @@ void sh_mv_setup(void); void check_for_initrd(void); void per_cpu_trap_init(void); +void sh_fdt_init(phys_addr_t dt_phys); #endif /* _SH_SETUP_H */ diff --git a/arch/sh/include/asm/syscalls.h b/arch/sh/include/asm/syscalls.h index 387105316d28..39240e06e8aa 100644 --- a/arch/sh/include/asm/syscalls.h +++ b/arch/sh/include/asm/syscalls.h @@ -8,6 +8,7 @@ asmlinkage int old_mmap(unsigned long addr, unsigned long len, asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); +asmlinkage int sys_cacheflush(unsigned long addr, unsigned long len, int op); #include <asm/syscalls_32.h> diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index aeb8915e9254..ddf324bfb9a0 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -24,6 +24,10 @@ static inline void tlb_unwire_entry(void) BUG(); } #endif /* CONFIG_CPU_SH4 */ + +asmlinkage int handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, + unsigned long address); + #endif /* CONFIG_MMU */ #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_TLB_H */ diff --git a/arch/sh/kernel/cpu/sh2a/opcode_helper.c b/arch/sh/kernel/cpu/sh2a/opcode_helper.c index c509081d90b9..fcf53f5827eb 100644 --- a/arch/sh/kernel/cpu/sh2a/opcode_helper.c +++ b/arch/sh/kernel/cpu/sh2a/opcode_helper.c @@ -8,6 +8,8 @@ */ #include <linux/kernel.h> +#include <asm/processor.h> + /* * Instructions on SH are generally fixed at 16-bits, however, SH-2A * introduces some 32-bit instructions. 
Since there are no real diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index 83ae1ad4a86e..d64d28c4f059 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -14,9 +14,12 @@ #include <linux/sh_timer.h> #include <linux/sh_intc.h> #include <linux/io.h> + +#include <asm/cacheflush.h> #include <asm/clock.h> #include <asm/mmzone.h> #include <asm/platform_early.h> + #include <cpu/sh7723.h> /* Serial */ diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 0d990ab1ba2a..ef4b26a4b3d6 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -21,6 +21,7 @@ #include <linux/io.h> #include <linux/notifier.h> +#include <asm/cacheflush.h> #include <asm/suspend.h> #include <asm/clock.h> #include <asm/mmzone.h> diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c index 67e330b7ea46..2ad19a0c5e04 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c @@ -17,8 +17,11 @@ #include <linux/sh_dma.h> #include <linux/sh_intc.h> #include <linux/usb/ohci_pdriver.h> + #include <cpu/dma-register.h> #include <cpu/sh7757.h> + +#include <asm/mmzone.h> #include <asm/platform_early.h> static struct plat_sci_port scif2_platform_data = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c index 74620f30b19b..c048842d8a58 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c @@ -400,20 +400,6 @@ static struct platform_device *sh7786_devices[] __initdata = { &usb_ohci_device, }; -/* - * Please call this function if your platform board - * use external clock for USB - * */ -#define USBCTL0 0xffe70858 -#define CLOCK_MODE_MASK 0xffffff7f -#define EXT_CLOCK_MODE 0x00000080 - -void __init sh7786_usb_use_exclock(void) -{ - u32 val = __raw_readl(USBCTL0) & CLOCK_MODE_MASK; - __raw_writel(val | EXT_CLOCK_MODE, USBCTL0); -} - #define USBINITREG1 0xffe70094 #define USBINITREG2 0xffe7009c #define USBINITVAL1 0x00ff0040 diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index bf8682e71830..45c8ae20d109 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -344,7 +344,7 @@ out: * dwarf_lookup_fde - locate the FDE that covers pc * @pc: the program counter */ -struct dwarf_fde *dwarf_lookup_fde(unsigned long pc) +static struct dwarf_fde *dwarf_lookup_fde(unsigned long pc) { struct rb_node **rb_node = &fde_root.rb_node; struct dwarf_fde *fde = NULL; diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index aed1ea8e2c2f..49c4ffd782d6 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -39,22 +39,17 @@ static DEFINE_PER_CPU(struct kprobe, saved_next_opcode2); int __kprobes arch_prepare_kprobe(struct kprobe *p) { - kprobe_opcode_t opcode = *(kprobe_opcode_t *) (p->addr); + kprobe_opcode_t opcode = *p->addr; if (OPCODE_RTE(opcode)) return -EFAULT; /* Bad breakpoint */ + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); p->opcode = opcode; return 0; } -void __kprobes arch_copy_kprobe(struct kprobe *p) -{ - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - p->opcode = *p->addr; -} - void __kprobes arch_arm_kprobe(struct kprobe *p) { *p->addr = BREAKPOINT_INSTRUCTION; @@ -253,7 +248,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) p = get_kprobe(addr); if (!p) { /* Not one of ours: let kernel 
handle it */ - if (*(kprobe_opcode_t *)addr != BREAKPOINT_INSTRUCTION) { + if (*addr != BREAKPOINT_INSTRUCTION) { /* * The breakpoint instruction was removed right * after we hit it. Another cpu has removed @@ -301,7 +296,7 @@ static void __used kretprobe_trampoline_holder(void) /* * Called when we hit the probe point at __kretprobe_trampoline */ -int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { regs->pc = __kretprobe_trampoline_handler(regs, NULL); diff --git a/arch/sh/kernel/return_address.c b/arch/sh/kernel/return_address.c index 8838094c9ff9..2ce22f11eab3 100644 --- a/arch/sh/kernel/return_address.c +++ b/arch/sh/kernel/return_address.c @@ -7,7 +7,9 @@ */ #include <linux/kernel.h> #include <linux/module.h> + #include <asm/dwarf.h> +#include <asm/ftrace.h> #ifdef CONFIG_DWARF_UNWINDER diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 5cf35a774dc7..108d808767fa 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -21,6 +21,8 @@ #include <linux/sched/hotplug.h> #include <linux/atomic.h> #include <linux/clockchips.h> +#include <linux/profile.h> + #include <asm/processor.h> #include <asm/mmu_context.h> #include <asm/smp.h> @@ -170,7 +172,7 @@ void native_play_dead(void) } #endif -asmlinkage void start_secondary(void) +static asmlinkage void start_secondary(void) { unsigned int cpu = smp_processor_id(); struct mm_struct *mm = &init_mm; @@ -320,11 +322,13 @@ void smp_message_recv(unsigned int msg) } } +#ifdef CONFIG_PROFILING /* Not really SMP stuff ... */ int setup_profiling_timer(unsigned int multiplier) { return 0; } +#endif #ifdef CONFIG_MMU diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 01884054aeb2..4339c4cafa79 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -15,6 +15,8 @@ #include <linux/extable.h> #include <linux/module.h> /* print_modules */ + +#include <asm/ftrace.h> #include <asm/unwinder.h> #include <asm/traps.h> @@ -170,14 +172,6 @@ BUILD_TRAP_HANDLER(bug) force_sig(SIGTRAP); } -#ifdef CONFIG_DYNAMIC_FTRACE -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - BUILD_TRAP_HANDLER(nmi) { TRAP_HANDLER_DECL; diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 6cdda3a621a1..1271b839a107 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -27,6 +27,7 @@ #include <asm/alignment.h> #include <asm/fpu.h> #include <asm/kprobes.h> +#include <asm/setup.h> #include <asm/traps.h> #include <asm/bl_bit.h> @@ -568,7 +569,7 @@ uspace_segv: /* * SH-DSP support gerg@snapgear.com. 
*/ -int is_dsp_inst(struct pt_regs *regs) +static int is_dsp_inst(struct pt_regs *regs) { unsigned short inst = 0; @@ -590,7 +591,7 @@ int is_dsp_inst(struct pt_regs *regs) return 0; } #else -#define is_dsp_inst(regs) (0) +static inline int is_dsp_inst(struct pt_regs *regs) { return 0; } #endif /* CONFIG_SH_DSP */ #ifdef CONFIG_CPU_SH2A diff --git a/arch/sh/lib/checksum.S b/arch/sh/lib/checksum.S index 3e07074e0098..06fed5a21e8b 100644 --- a/arch/sh/lib/checksum.S +++ b/arch/sh/lib/checksum.S @@ -33,7 +33,8 @@ */ /* - * asmlinkage __wsum csum_partial(const void *buf, int len, __wsum sum); + * unsigned int csum_partial(const unsigned char *buf, int len, + * unsigned int sum); */ .text @@ -45,31 +46,11 @@ ENTRY(csum_partial) * Fortunately, it is easy to convert 2-byte alignment to 4-byte * alignment for the unrolled loop. */ + mov r5, r1 mov r4, r0 - tst #3, r0 ! Check alignment. - bt/s 2f ! Jump if alignment is ok. - mov r4, r7 ! Keep a copy to check for alignment + tst #2, r0 ! Check alignment. + bt 2f ! Jump if alignment is ok. ! - tst #1, r0 ! Check alignment. - bt 21f ! Jump if alignment is boundary of 2bytes. - - ! buf is odd - tst r5, r5 - add #-1, r5 - bt 9f - mov.b @r4+, r0 - extu.b r0, r0 - addc r0, r6 ! t=0 from previous tst - mov r6, r0 - shll8 r6 - shlr16 r0 - shlr8 r0 - or r0, r6 - mov r4, r0 - tst #2, r0 - bt 2f -21: - ! buf is 2 byte aligned (len could be 0) add #-2, r5 ! Alignment uses up two bytes. cmp/pz r5 ! bt/s 1f ! Jump if we had at least two bytes. @@ -77,17 +58,16 @@ ENTRY(csum_partial) bra 6f add #2, r5 ! r5 was < 2. Deal with it. 1: + mov r5, r1 ! Save new len for later use. mov.w @r4+, r0 extu.w r0, r0 addc r0, r6 bf 2f add #1, r6 2: - ! buf is 4 byte aligned (len could be 0) - mov r5, r1 mov #-5, r0 - shld r0, r1 - tst r1, r1 + shld r0, r5 + tst r5, r5 bt/s 4f ! if it's =0, go to 4f clrt .align 2 @@ -109,31 +89,30 @@ ENTRY(csum_partial) addc r0, r6 addc r2, r6 movt r0 - dt r1 + dt r5 bf/s 3b cmp/eq #1, r0 - ! here, we know r1==0 - addc r1, r6 ! add carry to r6 + ! here, we know r5==0 + addc r5, r6 ! add carry to r6 4: - mov r5, r0 + mov r1, r0 and #0x1c, r0 tst r0, r0 - bt 6f - ! 4 bytes or more remaining - mov r0, r1 - shlr2 r1 + bt/s 6f + mov r0, r5 + shlr2 r5 mov #0, r2 5: addc r2, r6 mov.l @r4+, r2 movt r0 - dt r1 + dt r5 bf/s 5b cmp/eq #1, r0 addc r2, r6 - addc r1, r6 ! r1==0 here, so it means add carry-bit + addc r5, r6 ! r5==0 here, so it means add carry-bit 6: - ! 3 bytes or less remaining + mov r1, r5 mov #3, r0 and r0, r5 tst r5, r5 @@ -159,16 +138,6 @@ ENTRY(csum_partial) mov #0, r0 addc r0, r6 9: - ! Check if the buffer was misaligned, if so realign sum - mov r7, r0 - tst #1, r0 - bt 10f - mov r6, r0 - shll8 r6 - shlr16 r0 - shlr8 r0 - or r0, r6 -10: rts mov r6, r0 diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index cdaef6501d76..b65703e06573 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -15,6 +15,8 @@ #include <linux/perf_event.h> #include <linux/uaccess.h> + +#include <asm/fpu.h> #include <asm/processor.h> #include <asm/io.h> diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 862046f26981..195e739ee2be 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -376,8 +376,6 @@ static void __flush_cache_one(unsigned long addr, unsigned long phys, } while (--way_count != 0); } -extern void __weak sh4__flush_region_init(void); - /* * SH-4 has virtually indexed and physically tagged cache. 
*/ diff --git a/arch/sh/mm/cache-shx3.c b/arch/sh/mm/cache-shx3.c index 24c58b7dc022..dec039a75664 100644 --- a/arch/sh/mm/cache-shx3.c +++ b/arch/sh/mm/cache-shx3.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/io.h> #include <asm/cache.h> +#include <asm/cacheflush.h> #define CCR_CACHE_SNM 0x40000 /* Hardware-assisted synonym avoidance */ #define CCR_CACHE_IBE 0x1000000 /* ICBI broadcast */ diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 9bcaa5619eab..ceffd3ffc81e 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -320,30 +320,20 @@ void __init cpu_cache_init(void) goto skip; if (boot_cpu_data.type == CPU_J2) { - extern void __weak j2_cache_init(void); - j2_cache_init(); } else if (boot_cpu_data.family == CPU_FAMILY_SH2) { - extern void __weak sh2_cache_init(void); - sh2_cache_init(); } if (boot_cpu_data.family == CPU_FAMILY_SH2A) { - extern void __weak sh2a_cache_init(void); - sh2a_cache_init(); } if (boot_cpu_data.family == CPU_FAMILY_SH3) { - extern void __weak sh3_cache_init(void); - sh3_cache_init(); if ((boot_cpu_data.type == CPU_SH7705) && (boot_cpu_data.dcache.sets == 512)) { - extern void __weak sh7705_cache_init(void); - sh7705_cache_init(); } } @@ -351,14 +341,10 @@ void __init cpu_cache_init(void) if ((boot_cpu_data.family == CPU_FAMILY_SH4) || (boot_cpu_data.family == CPU_FAMILY_SH4A) || (boot_cpu_data.family == CPU_FAMILY_SH4AL_DSP)) { - extern void __weak sh4_cache_init(void); - sh4_cache_init(); if ((boot_cpu_data.type == CPU_SH7786) || (boot_cpu_data.type == CPU_SHX3)) { - extern void __weak shx3_cache_init(void); - shx3_cache_init(); } } diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c index 78c4b6e6d33b..fa3dc9428a73 100644 --- a/arch/sh/mm/nommu.c +++ b/arch/sh/mm/nommu.c @@ -10,6 +10,8 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/mm.h> + +#include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/uaccess.h> diff --git a/arch/sh/mm/pgtable.c b/arch/sh/mm/pgtable.c index cf7ce4b57359..3a4085ea0161 100644 --- a/arch/sh/mm/pgtable.c +++ b/arch/sh/mm/pgtable.c @@ -2,12 +2,14 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <asm/pgalloc.h> + static struct kmem_cache *pgd_cachep; #if PAGETABLE_LEVELS > 2 static struct kmem_cache *pmd_cachep; #endif -void pgd_ctor(void *x) +static void pgd_ctor(void *x) { pgd_t *pgd = x; diff --git a/arch/sh/mm/tlbex_32.c b/arch/sh/mm/tlbex_32.c index 1c53868632ee..7d58578c15f4 100644 --- a/arch/sh/mm/tlbex_32.c +++ b/arch/sh/mm/tlbex_32.c @@ -14,6 +14,7 @@ #include <linux/kdebug.h> #include <asm/mmu_context.h> #include <asm/thread_info.h> +#include <asm/tlb.h> /* * Called with interrupts disabled. 
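
As an illustration (not part of the patch): the parisc hunk earlier and the s390 and sparc hunks that follow all apply the same fix — bpf_jit_binary_lock_ro() can now fail, so its return value must be checked and the JITed image discarded so the program falls back to the interpreter. A rough user-space sketch of that error-handling shape; the names below (lock_image_ro() and friends) are stand-ins, not kernel APIs:

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

struct jit_header { void *image; };
struct prog { void *bpf_func; bool jited; size_t jited_len; };

/* Stand-in: returns 0 on success, a negative value if the image cannot be
 * made read-only. */
static int lock_image_ro(struct jit_header *hdr)
{
	(void)hdr;
	return -1; /* simulate failure */
}

static void free_image(struct jit_header *hdr)
{
	free(hdr->image);
	hdr->image = NULL;
}

static void finalize_jit(struct prog *p, struct jit_header *hdr)
{
	if (lock_image_ro(hdr)) {
		/* Could not lock the image read-only: drop the JITed code
		 * and leave the program to the interpreter. */
		free_image(hdr);
		p->bpf_func = NULL;
		p->jited = false;
		p->jited_len = 0;
		return;
	}
	p->jited = true;
}

int main(void)
{
	struct jit_header hdr = { .image = malloc(64) };
	struct prog p = { 0 };

	finalize_jit(&p, &hdr);
	return p.jited ? 0 : 1;
}
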
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index fa0759bfe498..73bf0aea8baf 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1602,7 +1602,11 @@ skip_init_ctx: bpf_flush_icache(header, (u8 *)header + header->size); if (!prog->is_func || extra_pass) { - bpf_jit_binary_lock_ro(header); + if (bpf_jit_binary_lock_ro(header)) { + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_off; + } } else { jit_data->ctx = ctx; jit_data->image = image_ptr; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 965d65edbae0..bc47bc9841ff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -170,6 +170,7 @@ config X86 select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY select GENERIC_VDSO_TIME_NS + select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE select HARDIRQS_SW_RESEND select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 @@ -466,6 +467,17 @@ config X86_X2APIC If you don't know what to do here, say N. +config X86_POSTED_MSI + bool "Enable MSI and MSI-x delivery by posted interrupts" + depends on X86_64 && IRQ_REMAP + help + This enables MSIs that are under interrupt remapping to be delivered as + posted interrupts to the host kernel. Interrupt throughput can + potentially be improved by coalescing CPU notifications during high + frequency bursts. + + If you don't know what to do here, say N. + config X86_MPPARSE bool "Enable MPS table" if ACPI default y diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index ec71846d28c9..0457a9d7e515 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -335,26 +335,6 @@ finish: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } -static void enforce_vmpl0(void) -{ - u64 attrs; - int err; - - /* - * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically - * higher) privilege level. Here, clear the VMPL1 permission mask of the - * GHCB page. If the guest is not running at VMPL0, this will fail. - * - * If the guest is running at VMPL0, it will succeed. Even if that operation - * modifies permission bits, it is still ok to do so currently because Linux - * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks - * changing is a don't-care. - */ - attrs = 1; - if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); -} - /* * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need * guest side implementation for proper functioning of the guest. If any @@ -413,6 +393,85 @@ void snp_check_features(void) } } +/* Search for Confidential Computing blob in the EFI config table. */ +static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) +{ + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + int ret; + + ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); + if (ret) + return NULL; + + return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, + cfg_table_len, + EFI_CC_BLOB_GUID); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the boot kernel + * by firmware/bootloader in the following ways: + * + * - via an entry in the EFI config table + * - via a setup_data structure, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. 
+ */ +static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + cc_info = find_cc_blob_efi(bp); + if (cc_info) + goto found_cc_info; + + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + return cc_info; +} + +/* + * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks + * will verify the SNP CPUID/MSR bits. + */ +static bool early_snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * If a SNP-specific Confidential Computing blob is present, then + * firmware/bootloader have indicated SNP support. Verifying this + * involves CPUID checks which will be more reliable if the SNP + * CPUID table is used. See comments over snp_setup_cpuid_table() for + * more details. + */ + setup_cpuid_table(cc_info); + + /* + * Pass run-time kernel a pointer to CC info via boot_params so EFI + * config table doesn't need to be searched again during early startup + * phase. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + /* * sev_check_cpu_support - Check for SEV support in the CPU capabilities * @@ -463,7 +522,7 @@ void sev_enable(struct boot_params *bp) bp->cc_blob_address = 0; /* - * Do an initial SEV capability check before snp_init() which + * Do an initial SEV capability check before early_snp_init() which * loads the CPUID page and the same checks afterwards are done * without the hypervisor and are trustworthy. * @@ -478,7 +537,7 @@ void sev_enable(struct boot_params *bp) * Setup/preliminary detection of SNP. This will be sanity-checked * against CPUID/MSR values later. */ - snp = snp_init(bp); + snp = early_snp_init(bp); /* Now repeat the checks with the SNP CPUID table. */ @@ -509,7 +568,20 @@ void sev_enable(struct boot_params *bp) if (!(get_hv_features() & GHCB_HV_FT_SNP)) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); - enforce_vmpl0(); + /* + * Enforce running at VMPL0. + * + * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically + * higher) privilege level. Here, clear the VMPL1 permission mask of the + * GHCB page. If the guest is not running at VMPL0, this will fail. + * + * If the guest is running at VMPL0, it will succeed. Even if that operation + * modifies permission bits, it is still ok to do so currently because Linux + * SNP guests running at VMPL0 only run at VMPL0, so VMPL1 or higher + * permission mask changes are a don't-care. + */ + if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); } if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) @@ -535,85 +607,6 @@ u64 sev_get_status(void) return m.q; } -/* Search for Confidential Computing blob in the EFI config table. 
*/ -static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) -{ - unsigned long cfg_table_pa; - unsigned int cfg_table_len; - int ret; - - ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); - if (ret) - return NULL; - - return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, - cfg_table_len, - EFI_CC_BLOB_GUID); -} - -/* - * Initial set up of SNP relies on information provided by the - * Confidential Computing blob, which can be passed to the boot kernel - * by firmware/bootloader in the following ways: - * - * - via an entry in the EFI config table - * - via a setup_data structure, as defined by the Linux Boot Protocol - * - * Scan for the blob in that order. - */ -static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - cc_info = find_cc_blob_efi(bp); - if (cc_info) - goto found_cc_info; - - cc_info = find_cc_blob_setup_data(bp); - if (!cc_info) - return NULL; - -found_cc_info: - if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); - - return cc_info; -} - -/* - * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks - * will verify the SNP CPUID/MSR bits. - */ -bool snp_init(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - if (!bp) - return false; - - cc_info = find_cc_blob(bp); - if (!cc_info) - return false; - - /* - * If a SNP-specific Confidential Computing blob is present, then - * firmware/bootloader have indicated SNP support. Verifying this - * involves CPUID checks which will be more reliable if the SNP - * CPUID table is used. See comments over snp_setup_cpuid_table() for - * more details. - */ - setup_cpuid_table(cc_info); - - /* - * Pass run-time kernel a pointer to CC info via boot_params so EFI - * config table doesn't need to be searched again during early startup - * phase. - */ - bp->cc_blob_address = (u32)(unsigned long)cc_info; - - return true; -} - void sev_prep_identity_maps(unsigned long top_level_pgt) { /* diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 89c1476fcdd9..f004a4dc74c2 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + + SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification), }; static bool fred_setup_done __initdata; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index f896eed4516c..5af926c050f0 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -56,6 +56,8 @@ static inline void disable_acpi(void) extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); +extern int acpi_blacklisted(void); + static inline void acpi_noirq_set(void) { acpi_noirq = 1; } static inline void acpi_disable_pci(void) { diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 0cb2396de066..ba99ef75f56c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -286,20 +286,6 @@ static inline int alternatives_text_reserved(void *start, void *end) asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \ : : "i" (0), ## input) -/* - * This is similar to alternative_input. But it has two features and - * respective instructions. 
- * - * If CPU has feature2, newinstr2 is used. - * Otherwise, if CPU has feature1, newinstr1 is used. - * Otherwise, oldinstr is used. - */ -#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \ - ft_flags2, input...) \ - asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \ - newinstr2, ft_flags2) \ - : : "i" (0), ## input) - /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 839c58f95114..9327eb00e96d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -14,6 +14,7 @@ #include <asm/msr.h> #include <asm/hardirq.h> #include <asm/io.h> +#include <asm/posted_intr.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -500,6 +501,11 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) return !!(irr & (1U << (vector % 32))); } +static inline bool is_vector_pending(unsigned int vector) +{ + return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector); +} + /* * Warm reset vector position: */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index b96f9cabea0c..0b9611da6c53 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -149,8 +149,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); -#define setup_force_cpu_cap(bit) do { \ - set_cpu_cap(&boot_cpu_data, bit); \ +#define setup_force_cpu_cap(bit) do { \ + \ + if (!boot_cpu_has(bit)) \ + WARN_ON(alternatives_patched); \ + \ + set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index fbc7722b87d1..c67fa6ad098a 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -44,10 +44,16 @@ typedef struct { unsigned int irq_hv_reenlightenment_count; unsigned int hyperv_stimer0_count; #endif +#ifdef CONFIG_X86_POSTED_MSI + unsigned int posted_msi_notification_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +#ifdef CONFIG_X86_POSTED_MSI +DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); +#endif #define __ARCH_IRQ_STAT #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 749c7411d2f1..d4f24499b256 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_X86_POSTED_MSI +DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); +#else +# define fred_sysvec_posted_msi_notification NULL +# endif + #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 7a2ed154a5e1..5036f13ab69f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -50,6 +50,13 @@ static inline struct 
irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } +extern bool enable_posted_msi; + +static inline bool posted_msi_supported(void) +{ + return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP); +} + #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index d18bfb238f66..13aea8fc3d45 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -97,10 +97,16 @@ #define LOCAL_TIMER_VECTOR 0xec +/* + * Posted interrupt notification vector for all device MSIs delivered to + * the host kernel. + */ +#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb + #define NR_VECTORS 256 #ifdef CONFIG_X86_LOCAL_APIC -#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR +#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR #else #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c72c7ff78fcd..d593e52e6635 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -16,10 +16,10 @@ extern int pic_mode; * Summit or generic (i.e. installer) kernels need lots of bus entries. * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */ -#if CONFIG_BASE_SMALL == 0 -# define MAX_MP_BUSSES 260 -#else +#ifdef CONFIG_BASE_SMALL # define MAX_MP_BUSSES 32 +#else +# define MAX_MP_BUSSES 260 #endif #define MAX_IRQ_SOURCES 256 diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h new file mode 100644 index 000000000000..de788b400fba --- /dev/null +++ b/arch/x86/include/asm/posted_intr.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_POSTED_INTR_H +#define _X86_POSTED_INTR_H +#include <asm/irq_vectors.h> + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +#define PID_TABLE_ENTRY_VALID 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + union { + u32 pir[8]; /* Posted interrupt requested */ + u64 pir64[4]; + }; + union { + struct { + u16 notifications; /* Suppress and outstanding bits */ + u8 nv; + u8 rsvd_2; + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_on(struct pi_desc *pi_desc) +{ 
+ return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +/* Non-atomic helpers */ +static inline void __pi_set_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications |= BIT(POSTED_INTR_SN); +} + +static inline void __pi_clear_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications &= ~BIT(POSTED_INTR_SN); +} + +#ifdef CONFIG_X86_POSTED_MSI +/* + * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's + * own interrupts. Here we do not distinguish them since those vector bits in + * PIR will always be zero. + */ +static inline bool pi_pending_this_cpu(unsigned int vector) +{ + struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc); + + if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) + return false; + + return test_bit(vector, (unsigned long *)pid->pir); +} + +extern void intel_posted_msi_init(void); +#else +static inline bool pi_pending_this_cpu(unsigned int vector) { return false; } + +static inline void intel_posted_msi_init(void) {}; +#endif /* X86_POSTED_MSI */ + +#endif /* _X86_POSTED_INTR_H */ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 93ed60080cfe..ca20cc4e5826 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -140,7 +140,7 @@ struct secrets_os_area { #define VMPCK_KEY_LEN 32 /* See the SNP spec version 0.9 for secrets page format */ -struct snp_secrets_page_layout { +struct snp_secrets_page { u32 version; u32 imien : 1, rsvd1 : 31; diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 1be13b2dfe8b..64df897c0ee3 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -37,8 +37,6 @@ extern int phys_to_target_node(phys_addr_t start); #define phys_to_target_node phys_to_target_node extern int memory_add_physaddr_to_nid(u64 start); #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid -extern int numa_fill_memblks(u64 start, u64 end); -#define numa_fill_memblks numa_fill_memblks #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 345aafbc1964..6259f1937fe7 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -15,7 +15,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); -extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); +extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); /* * Clear and restore the kernel write-protection flag on the local CPU. diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 8e048ca980df..0ef36190abe6 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -300,7 +300,7 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) #define vdso_cycles_ok arch_vdso_cycles_ok /* - * x86 specific delta calculation. + * x86 specific calculation of nanoseconds for the current cycle count * * The regular implementation assumes that clocksource reads are globally * monotonic. The TSC can be slightly off across sockets which can cause @@ -308,8 +308,8 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * jump. * * Therefore it needs to be verified that @cycles are greater than - * @last. 
If not then use @last, which is the base time of the current - * conversion period. + * @vd->cycles_last. If not then use @vd->cycles_last, which is the base + * time of the current conversion period. * * This variant also uses a custom mask because while the clocksource mask of * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses @@ -317,25 +317,37 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * declares everything with the MSB/Sign-bit set as invalid. Therefore the * effective mask is S64_MAX. */ -static __always_inline -u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base) { - /* - * Due to the MSB/Sign-bit being used as invalid marker (see - * arch_vdso_cycles_valid() above), the effective mask is S64_MAX. - */ - u64 delta = (cycles - last) & S64_MAX; + u64 delta = cycles - vd->cycle_last; /* - * Due to the above mentioned TSC wobbles, filter out negative motion. - * Per the above masking, the effective sign bit is now bit 62. + * Negative motion and deltas which can cause multiplication + * overflow require special treatment. This check covers both as + * negative motion is guaranteed to be greater than @vd::max_cycles + * due to unsigned comparison. + * + * Due to the MSB/Sign-bit being used as invalid marker (see + * arch_vdso_cycles_valid() above), the effective mask is S64_MAX, + * but that case is also unlikely and will also take the unlikely path + * here. */ - if (unlikely(delta & (1ULL << 62))) - return 0; + if (unlikely(delta > vd->max_cycles)) { + /* + * Due to the above mentioned TSC wobbles, filter out + * negative motion. Per the above masking, the effective + * sign bit is now bit 62. + */ + if (delta & (1ULL << 62)) + return base >> vd->shift; + + /* Handle multiplication overflow gracefully */ + return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift); + } - return delta * mult; + return ((delta * vd->mult) + base) >> vd->shift; } -#define vdso_calc_delta vdso_calc_delta +#define vdso_calc_ns vdso_calc_ns #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 45a280f2161c..7555c15b7183 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -125,6 +125,20 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = }; /* + * Nomenclature for variable names to simplify and clarify this code and ease + * any potential staring at it: + * + * @instr: source address of the original instructions in the kernel text as + * generated by the compiler. + * + * @buf: temporary buffer on which the patching operates. This buffer is + * eventually text-poked into the kernel image. + * + * @replacement/@repl: pointer to the opcodes which are replacing @instr, located + * in the .altinstr_replacement section. + */ + +/* * Fill the buffer with a single effective instruction of size @len. * * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) @@ -133,28 +147,28 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. 
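For reference, the overflow branch added to vdso_calc_ns() above is a 64x32+64 multiply-add evaluated with a wide intermediate; mul_u64_u32_add_u64_shr() is the kernel helper for exactly that shape. A minimal model of it, using a compiler-provided 128-bit type instead of the helper (names and types here are ours):

/*
 * Sketch only: what (delta * mult + base) >> shift means when the product
 * can exceed 64 bits. The kernel uses mul_u64_u32_add_u64_shr() so that
 * 32-bit builds avoid a generic 128-bit multiply.
 */
static inline unsigned long long calc_ns_wide(unsigned long long delta,
					      unsigned int mult,
					      unsigned long long base,
					      unsigned int shift)
{
	unsigned __int128 ns = (unsigned __int128)delta * mult + base;

	return (unsigned long long)(ns >> shift);
}
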
*/ -static void add_nop(u8 *instr, unsigned int len) +static void add_nop(u8 *buf, unsigned int len) { - u8 *target = instr + len; + u8 *target = buf + len; if (!len) return; if (len <= ASM_NOP_MAX) { - memcpy(instr, x86_nops[len], len); + memcpy(buf, x86_nops[len], len); return; } if (len < 128) { - __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); - instr += JMP8_INSN_SIZE; + __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE); + buf += JMP8_INSN_SIZE; } else { - __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); - instr += JMP32_INSN_SIZE; + __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE); + buf += JMP32_INSN_SIZE; } - for (;instr < target; instr++) - *instr = INT3_INSN_OPCODE; + for (;buf < target; buf++) + *buf = INT3_INSN_OPCODE; } extern s32 __retpoline_sites[], __retpoline_sites_end[]; @@ -187,12 +201,12 @@ static bool insn_is_nop(struct insn *insn) * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ -static int skip_nops(u8 *instr, int offset, int len) +static int skip_nops(u8 *buf, int offset, int len) { struct insn insn; for (; offset < len; offset += insn.length) { - if (insn_decode_kernel(&insn, &instr[offset])) + if (insn_decode_kernel(&insn, &buf[offset])) break; if (!insn_is_nop(&insn)) @@ -203,66 +217,32 @@ static int skip_nops(u8 *instr, int offset, int len) } /* - * Optimize a sequence of NOPs, possibly preceded by an unconditional jump - * to the end of the NOP sequence into a single NOP. - */ -static bool -__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) -{ - int i = *next - insn->length; - - switch (insn->opcode.bytes[0]) { - case JMP8_INSN_OPCODE: - case JMP32_INSN_OPCODE: - *prev = i; - *target = *next + insn->immediate.value; - return false; - } - - if (insn_is_nop(insn)) { - int nop = i; - - *next = skip_nops(instr, *next, len); - if (*target && *next == *target) - nop = *prev; - - add_nop(instr + nop, *next - nop); - DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); - return true; - } - - *target = 0; - return false; -} - -/* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) { - int prev, target = 0; - for (int next, i = 0; i < len; i = next) { struct insn insn; - if (insn_decode_kernel(&insn, &instr[i])) + if (insn_decode_kernel(&insn, &buf[i])) return; next = i + insn.length; - __optimize_nops(instr, len, &insn, &next, &prev, &target); - } -} + if (insn_is_nop(&insn)) { + int nop = i; -static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len) -{ - unsigned long flags; + /* Has the NOP already been optimized? 
*/ + if (i + insn.length == len) + return; - local_irq_save(flags); - optimize_nops(instr, len); - sync_core(); - local_irq_restore(flags); + next = skip_nops(buf, next, len); + + add_nop(buf + nop, next - nop); + DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next); + } + } } /* @@ -335,11 +315,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) return (target < src || target > src + src_len); } -void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { - int prev, target = 0; - - for (int next, i = 0; i < len; i = next) { + for (int next, i = 0; i < instrlen; i = next) { struct insn insn; if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) @@ -347,9 +325,6 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) next = i + insn.length; - if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) - continue; - switch (insn.opcode.bytes[0]) { case 0x0f: if (insn.opcode.bytes[1] < 0x80 || @@ -361,10 +336,10 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) case JMP8_INSN_OPCODE: case JMP32_INSN_OPCODE: case CALL_INSN_OPCODE: - if (need_reloc(next + insn.immediate.value, src, src_len)) { + if (need_reloc(next + insn.immediate.value, repl, repl_len)) { apply_reloc(insn.immediate.nbytes, buf + i + insn_offset_immediate(&insn), - src - dest); + repl - instr); } /* @@ -372,7 +347,7 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) */ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { s32 imm = insn.immediate.value; - imm += src - dest; + imm += repl - instr; imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; if ((imm >> 31) == (imm >> 7)) { buf[i+0] = JMP8_INSN_OPCODE; @@ -385,15 +360,21 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) } if (insn_rip_relative(&insn)) { - if (need_reloc(next + insn.displacement.value, src, src_len)) { + if (need_reloc(next + insn.displacement.value, repl, repl_len)) { apply_reloc(insn.displacement.nbytes, buf + i + insn_offset_displacement(&insn), - src - dest); + repl - instr); } } } } +void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +{ + __apply_relocation(buf, instr, instrlen, repl, repl_len); + optimize_nops(instr, buf, repl_len); +} + /* Low-level backend functions usable from alternative code replacements. */ DEFINE_ASM_FUNC(nop_func, "", .entry.text); EXPORT_SYMBOL_GPL(nop_func); @@ -464,9 +445,9 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) void __init_or_module noinline apply_alternatives(struct alt_instr *start, struct alt_instr *end) { - struct alt_instr *a; - u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; + u8 *instr, *replacement; + struct alt_instr *a; DPRINTK(ALT, "alt table %px, -> %px", start, end); @@ -504,7 +485,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. 
*/ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - optimize_nops_inplace(instr, a->instrlen); + memcpy(insn_buff, instr, a->instrlen); + optimize_nops(instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); continue; } @@ -526,7 +509,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); + apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -761,7 +744,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { - optimize_nops(bytes, len); + optimize_nops(addr, bytes, len); DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0d22aefbde7f..66fd4b2a37a3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -631,7 +631,7 @@ void lapic_update_tsc_freq(void) static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; -static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata u32 lapic_cal_pm1, lapic_cal_pm2; static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* @@ -641,7 +641,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) { unsigned long long tsc = 0; long tapic = apic_read(APIC_TMCCT); - unsigned long pm = acpi_pm_read_early(); + u32 pm = acpi_pm_read_early(); if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); @@ -666,7 +666,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } static int __init -calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) +calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc) { const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; const long pm_thresh = pm_100ms / 100; @@ -677,7 +677,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) return -1; #endif - apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm); + apic_printk(APIC_VERBOSE, "... PM-Timer delta = %u\n", deltapm); /* Check, if the PM timer is available */ if (!deltapm) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 185738c72766..9eec52925fa3 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) lockdep_assert_held(&vector_lock); hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { - unsigned int irr, vector = apicd->prev_vector; + unsigned int vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned @@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) * fixup_irqs() was just called to scan IRR for set bits and * forward them to new destination CPUs via IPIs. */ - irr = check_irr ? 
apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; - if (irr & (1U << (vector % 32))) { + if (check_irr && is_vector_pending(vector)) { pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); rearm = true; continue; diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index e92ff0c11db8..465647456753 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -185,8 +185,7 @@ static void *patch_dest(void *dest, bool direct) u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); - apply_relocation(insn_buff, tsize, pad, - skl_call_thunk_template, tsize); + apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? */ if (!bcmp(pad, insn_buff, tsize)) @@ -308,8 +307,7 @@ static bool is_callthunk(void *addr) pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, tmpl_size, pad, - skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } @@ -327,8 +325,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, tmpl_size, ip, - skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cdaa795a9371..2b170da84f97 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -68,6 +68,7 @@ #include <asm/traps.h> #include <asm/sev.h> #include <asm/tdx.h> +#include <asm/posted_intr.h> #include "cpu.h" @@ -2222,6 +2223,8 @@ void cpu_init(void) barrier(); x2apic_setup(); + + intel_posted_msi_init(); } mmgrab(&init_mm); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 946813d816bf..b7d9f530ae16 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -114,6 +114,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) if (WARN_ON(feature >= MAX_FEATURE_BITS)) return; + if (boot_cpu_has(feature)) + WARN_ON(alternatives_patched); + clear_feature(c, feature); /* Collect all features to disable, handling dependencies */ diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index fbe8b61c3413..4284749ec803 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -16,14 +16,14 @@ * used to save error information organized in a lock-less list. * * This memory pool is only to be used to save MCE records in MCE context. - * MCE events are rare, so a fixed size memory pool should be enough. Use - * 2 pages to save MCE events for now (~80 MCE records at most). + * MCE events are rare, so a fixed size memory pool should be enough. + * Allocate on a sliding scale based on number of CPUs. 
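The sliding-scale policy described here is implemented in the replacement hunk that follows: max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU) records, each occupying the pool's minimum allocation of 1 << order bytes. A back-of-the-envelope helper, purely illustrative (the real entry size comes from struct mce_evt_llist):

/*
 * Sketch only: pool size in bytes for nr_cpus CPUs, given the pool's minimum
 * allocation order. Under the new policy a 2-CPU guest keeps the 80-record
 * floor, while a 512-CPU box gets 512 * 2 = 1024 records.
 */
static unsigned int mce_pool_bytes(unsigned int nr_cpus,
				   unsigned int entry_order)
{
	unsigned int records = nr_cpus * 2;	/* MCE_PER_CPU */

	if (records < 80)			/* MCE_MIN_ENTRIES */
		records = 80;
	return records << entry_order;
}
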
*/ -#define MCE_POOLSZ (2 * PAGE_SIZE) +#define MCE_MIN_ENTRIES 80 +#define MCE_PER_CPU 2 static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); -static char gen_pool_buf[MCE_POOLSZ]; /* * Compare the record "t" with each of the records on list "l" to see if @@ -118,22 +118,32 @@ int mce_gen_pool_add(struct mce *mce) static int mce_gen_pool_create(void) { - struct gen_pool *tmpp; + int mce_numrecords, mce_poolsz, order; + struct gen_pool *gpool; int ret = -ENOMEM; - - tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); - if (!tmpp) - goto out; - - ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + void *mce_pool; + + order = order_base_2(sizeof(struct mce_evt_llist)); + gpool = gen_pool_create(order, -1); + if (!gpool) + return ret; + + mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); + mce_poolsz = mce_numrecords * (1 << order); + mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); + if (!mce_pool) { + gen_pool_destroy(gpool); + return ret; + } + ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1); if (ret) { - gen_pool_destroy(tmpp); - goto out; + gen_pool_destroy(gpool); + kfree(mce_pool); + return ret; } - mce_evt_pool = tmpp; + mce_evt_pool = gpool; -out: return ret; } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 13b45b9c806d..c0d56c02b8da 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -84,8 +84,6 @@ struct microcode_amd { unsigned int mpb[]; }; -#define PATCH_MAX_SIZE (3 * PAGE_SIZE) - static struct equiv_cpu_table { unsigned int num_entries; struct equiv_cpu_entry *entry; @@ -465,7 +463,7 @@ static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, siz return !__apply_microcode_amd(mc); } -static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) +static bool get_builtin_microcode(struct cpio_data *cp, u8 family) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct firmware fw; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 232026a239a6..b3658d11e7b6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -60,11 +60,6 @@ module_param(force_minrev, bool, S_IRUSR | S_IWUSR); */ struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -struct cpu_info_ctx { - struct cpu_signature *cpu_sig; - int err; -}; - /* * Those patch levels cannot be updated to newer ones and thus should be final. 
*/ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 19b4fdb94a36..a113d9aba553 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -56,14 +56,9 @@ int max_name_width, max_data_width; */ bool rdt_alloc_capable; -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); +static void mba_wrmsr_intel(struct msr_param *m); +static void cat_wrmsr(struct msr_param *m); +static void mba_wrmsr_amd(struct msr_param *m); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) @@ -309,12 +304,11 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2); } -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void mba_wrmsr_amd(struct msr_param *m) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); @@ -334,25 +328,22 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) return r->default_ctrl; } -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r) +static void mba_wrmsr_intel(struct msr_param *m) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. 
*/ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); + wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void cat_wrmsr(struct msr_param *m) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); @@ -362,6 +353,8 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) { struct rdt_domain *d; + lockdep_assert_cpus_held(); + list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ if (cpumask_test_cpu(cpu, &d->cpu_mask)) @@ -378,19 +371,11 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r) void rdt_ctrl_update(void *arg) { + struct rdt_hw_resource *hw_res; struct msr_param *m = arg; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); - struct rdt_resource *r = m->res; - int cpu = smp_processor_id(); - struct rdt_domain *d; - d = get_domain_from_cpu(cpu, r); - if (d) { - hw_res->msr_update(d, m, r); - return; - } - pr_warn_once("cpu %d not found in any domain for resource %s\n", - cpu, r->name); + hw_res = resctrl_to_arch_res(m->res); + hw_res->msr_update(m); } /* @@ -463,9 +448,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) hw_dom->ctrl_val = dc; setup_default_ctrlval(r, dc); + m.res = r; + m.dom = d; m.low = 0; m.high = hw_res->num_closid; - hw_res->msr_update(d, &m, r); + hw_res->msr_update(&m); return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 7997b47743a2..b7291f60399c 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -272,22 +272,6 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type) } } -static bool apply_config(struct rdt_hw_domain *hw_dom, - struct resctrl_staged_config *cfg, u32 idx, - cpumask_var_t cpu_mask) -{ - struct rdt_domain *dom = &hw_dom->d_resctrl; - - if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { - cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - hw_dom->ctrl_val[idx] = cfg->new_ctrl; - - return true; - } - - return false; -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { @@ -302,9 +286,10 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, hw_dom->ctrl_val[idx] = cfg_val; msr_param.res = r; + msr_param.dom = d; msr_param.low = idx; msr_param.high = idx + 1; - hw_res->msr_update(d, &msr_param, r); + hw_res->msr_update(&msr_param); return 0; } @@ -315,48 +300,39 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_hw_domain *hw_dom; struct msr_param msr_param; enum resctrl_conf_type t; - cpumask_var_t cpu_mask; struct rdt_domain *d; u32 idx; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); + msr_param.res = NULL; for (t = 0; t < CDP_NUM_TYPES; t++) { cfg = &hw_dom->d_resctrl.staged_config[t]; if (!cfg->have_new_ctrl) 
continue; idx = get_config_index(closid, t); - if (!apply_config(hw_dom, cfg, idx, cpu_mask)) + if (cfg->new_ctrl == hw_dom->ctrl_val[idx]) continue; + hw_dom->ctrl_val[idx] = cfg->new_ctrl; if (!msr_param.res) { msr_param.low = idx; msr_param.high = msr_param.low + 1; msr_param.res = r; + msr_param.dom = d; } else { msr_param.low = min(msr_param.low, idx); msr_param.high = max(msr_param.high, idx + 1); } } + if (msr_param.res) + smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1); } - if (cpumask_empty(cpu_mask)) - goto done; - - /* Update resource control msr on all the CPUs. */ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - -done: - free_cpumask_var(cpu_mask); - return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 1a8687f8073a..f1d926832ec8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -379,11 +379,13 @@ static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use + * @dom: The domain to update * @low: Beginning index from base MSR * @high: End index */ struct msr_param { struct rdt_resource *res; + struct rdt_domain *dom; u32 low; u32 high; }; @@ -443,8 +445,7 @@ struct rdt_hw_resource { struct rdt_resource r_resctrl; u32 num_closid; unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); + void (*msr_update)(struct msr_param *m); unsigned int mon_scale; unsigned int mbm_width; unsigned int mbm_cfg_mask; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c34a35ec0f03..2345e6836593 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -24,6 +24,7 @@ #include <asm/resctrl.h> #include "internal.h" +#include "trace.h" /** * struct rmid_entry - dirty tracking for all RMID. @@ -354,6 +355,16 @@ void __check_limbo(struct rdt_domain *d, bool force_free) rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + + /* + * x86's CLOSID and RMID are independent numbers, so the entry's + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't + * used to select the configuration. It is thus necessary to track both + * CLOSID and RMID because there may be dependencies between them + * on some architectures. 
+ */ + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val); } if (force_free || !rmid_dirty) { diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 04584a76ceb4..aacf236dfe3b 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -31,7 +31,7 @@ #include "internal.h" #define CREATE_TRACE_POINTS -#include "pseudo_lock_event.h" +#include "trace.h" /* * The bits needed to disable hardware prefetching varies based on the diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 011e17efb1a6..02f213f1c51c 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2813,16 +2813,12 @@ static int reset_all_ctrls(struct rdt_resource *r) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom; struct msr_param msr_param; - cpumask_var_t cpu_mask; struct rdt_domain *d; int i; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - msr_param.res = r; msr_param.low = 0; msr_param.high = hw_res->num_closid; @@ -2834,17 +2830,13 @@ static int reset_all_ctrls(struct rdt_resource *r) */ list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); for (i = 0; i < hw_res->num_closid; i++) hw_dom->ctrl_val[i] = r->default_ctrl; + msr_param.dom = d; + smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1); } - /* Update CBM on all the CPUs in cpu_mask */ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - - free_cpumask_var(cpu_mask); - return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h index 428ebbd4270b..2a506316b303 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h +++ b/arch/x86/kernel/cpu/resctrl/trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PSEUDO_LOCK_H +#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RESCTRL_H #include <linux/tracepoint.h> @@ -35,9 +35,25 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -#endif /* _TRACE_PSEUDO_LOCK_H */ +TRACE_EVENT(mon_llc_occupancy_limbo, + TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), + TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), + TP_STRUCT__entry(__field(u32, ctrl_hw_id) + __field(u32, mon_hw_id) + __field(int, domain_id) + __field(u64, llc_occupancy_bytes)), + TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; + __entry->mon_hw_id = mon_hw_id; + __entry->domain_id = domain_id; + __entry->llc_occupancy_bytes = llc_occupancy_bytes;), + TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", + __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, + __entry->llc_occupancy_bytes) + ); + +#endif /* _TRACE_RESCTRL_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . 
-#define TRACE_INCLUDE_FILE pseudo_lock_event +#define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index fc37c8d83daf..f445bec516a0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = { # endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_POSTED_MSI + INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification), +# endif #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 35fde0107901..385e3a5fc304 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,8 @@ #include <asm/desc.h> #include <asm/traps.h> #include <asm/thermal.h> +#include <asm/posted_intr.h> +#include <asm/irq_remapping.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -182,6 +184,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_X86_POSTED_MSI + seq_printf(p, "%*s: ", prec, "PMN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->posted_msi_notification_count); + seq_puts(p, " Posted MSI notification event\n"); +#endif return 0; } @@ -240,24 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc, __handle_irq(desc, regs); } -/* - * common_interrupt() handles all normal device IRQ's (the special SMP - * cross-CPU interrupts have their own entry points). - */ -DEFINE_IDTENTRY_IRQ(common_interrupt) +static __always_inline int call_irq_handler(int vector, struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - - /* entry code tells RCU that we're not quiescent. Check it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + int ret = 0; desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - apic_eoi(); - + ret = -EINVAL; if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), @@ -267,6 +268,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) } } + return ret; +} + +/* + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). + */ +DEFINE_IDTENTRY_IRQ(common_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* entry code tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + if (unlikely(call_irq_handler(vector, regs))) + apic_eoi(); + set_irq_regs(old_regs); } @@ -334,12 +352,139 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) } #endif +#ifdef CONFIG_X86_POSTED_MSI + +/* Posted Interrupt Descriptors for coalesced MSIs to be posted */ +DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); + +void intel_posted_msi_init(void) +{ + u32 destination; + u32 apic_id; + + this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR); + + /* + * APIC destination ID is stored in bit 8:15 while in XAPIC mode. + * VT-d spec. CH 9.11 + */ + apic_id = this_cpu_read(x86_cpu_to_apicid); + destination = x2apic_enabled() ? 
apic_id : apic_id << 8; + this_cpu_write(posted_msi_pi_desc.ndst, destination); +} + +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg + * for checking and clearing posted interrupt request (PIR), a 256 bit field + * within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * to dispatch interrupt handlers. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +{ + int i, vec = FIRST_EXTERNAL_VECTOR; + unsigned long pir_copy[4]; + bool handled = false; + + for (i = 0; i < 4; i++) + pir_copy[i] = pir[i]; + + for (i = 0; i < 4; i++) { + if (!pir_copy[i]) + continue; + + pir_copy[i] = arch_xchg(&pir[i], 0); + handled = true; + } + + if (handled) { + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); + } + + return handled; +} + +/* + * Performance data shows that 3 is good enough to harvest 90+% of the benefit + * on high IRQ rate workload. + */ +#define MAX_POSTED_MSI_COALESCING_LOOP 3 + +/* + * For MSIs that are delivered as posted interrupts, the CPU notifications + * can be coalesced if the MSIs arrive in high frequency bursts. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct pi_desc *pid; + int i = 0; + + pid = this_cpu_ptr(&posted_msi_pi_desc); + + inc_irq_stat(posted_msi_notification_count); + irq_enter(); + + /* + * Max coalescing count includes the extra round of handle_pending_pir + * after clearing the outstanding notification bit. Hence, at most + * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. + */ + while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { + if (!handle_pending_pir(pid->pir64, regs)) + break; + } + + /* + * Clear outstanding notification bit to allow new IRQ notifications, + * do this last to maximize the window of interrupt coalescing. + */ + pi_clear_on(pid); + + /* + * There could be a race of PI notification and the clearing of ON bit, + * process PIR bits one last time such that handling the new interrupts + * are not delayed until the next IRQ. 
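The copy-then-exchange sequence in handle_pending_pir() above is what the cache-line commentary is describing: plain reads of all four PIR words first, atomic exchanges only for the words that actually have bits set. A portable sketch of the same pattern, ours, using C11 atomics in place of arch_xchg():

/*
 * Sketch only: harvest a 256-bit posted-interrupt request field by reading
 * all words first and exchanging only the non-zero ones, so the shared
 * cache line is dirtied as little and as late as possible.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool harvest_pir(_Atomic uint64_t pir[4], uint64_t out[4])
{
	bool handled = false;
	int i;

	for (i = 0; i < 4; i++)		/* cheap shared reads */
		out[i] = atomic_load_explicit(&pir[i], memory_order_relaxed);

	for (i = 0; i < 4; i++) {
		if (!out[i])
			continue;
		out[i] = atomic_exchange(&pir[i], 0);	/* claim the bits */
		handled = true;
	}
	return handled;
}
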
+ */ + handle_pending_pir(pid->pir64, regs); + + apic_eoi(); + irq_exit(); + set_irq_regs(old_regs); +} +#endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irr, vector; + unsigned int vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; @@ -366,8 +511,7 @@ void fixup_irqs(void) if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - if (irr & (1 << (vector % 32))) { + if (is_vector_pending(vector)) { desc = __this_cpu_read(vector_irq[vector]); raw_spin_lock(&desc->lock); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 995f94467101..3342ed58e168 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -648,7 +648,7 @@ static u64 __init get_secrets_page(void) static u64 __init get_snp_jump_table_addr(void) { - struct snp_secrets_page_layout *layout; + struct snp_secrets_page *secrets; void __iomem *mem; u64 pa, addr; @@ -662,9 +662,9 @@ static u64 __init get_snp_jump_table_addr(void) return 0; } - layout = (__force struct snp_secrets_page_layout *)mem; + secrets = (__force struct snp_secrets_page *)mem; - addr = layout->os_area.ap_jump_table_pa; + addr = secrets->os_area.ap_jump_table_pa; iounmap(mem); return addr; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 1123ef3ccf90..4334033658ed 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -193,11 +193,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) cur->warned = false; /* - * If a non-zero TSC value for socket 0 may be valid then the default - * adjusted value cannot assumed to be zero either. + * The default adjust value cannot be assumed to be zero on any socket. */ - if (tsc_async_resets) - cur->adjusted = bootval; + cur->adjusted = bootval; /* * Check whether this CPU is the first in a package to come up. In diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index af662312fd07..ec08fa3caf43 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * handle task migration (@cpu != vcpu->cpu). 
*/ new.ndst = dest; - new.sn = 0; + __pi_clear_sn(&new); /* * Restore the notification vector; in the blocking case, the @@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking"); old.control = READ_ONCE(pi_desc->control); do { diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 26992076552e..6b2a0226257e 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -1,98 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -#define PID_TABLE_ENTRY_VALID 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) -{ - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_on(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} +#include <asm/posted_intr.h> void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 22411f4aff53..becefaf95cab 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -70,6 +70,7 @@ #include "x86.h" #include "smm.h" #include "vmx_onhyperv.h" +#include "posted_intr.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -4844,7 +4845,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * or POSTED_INTR_WAKEUP_VECTOR. 
*/ vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; + __pi_set_sn(&vmx->pi_desc); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 90f9e4434646..7e483366b31e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -7,10 +7,10 @@ #include <asm/kvm.h> #include <asm/intel_pt.h> #include <asm/perf_event.h> +#include <asm/posted_intr.h> #include "capabilities.h" #include "../kvm_cache_regs.h" -#include "posted_intr.h" #include "vmcs.h" #include "vmx_ops.h" #include "../cpuid.h" diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 65e9a6e391c0..ce84ba86e69e 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -929,6 +929,8 @@ int memory_add_physaddr_to_nid(u64 start) } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + static int __init cmp_memblk(const void *a, const void *b) { const struct numa_memblk *ma = *(const struct numa_memblk **)a; @@ -1001,5 +1003,3 @@ int __init numa_fill_memblks(u64 start, u64 end) } return 0; } - -#endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 59cbc94b6e69..5159c7a22922 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -816,9 +816,10 @@ done: static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { + u64 imm64 = ((u64)imm32_hi << 32) | (u32)imm32_lo; u8 *prog = *pprog; - if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + if (is_uimm32(imm64)) { /* * For emitting plain u32, where sign bit must not be * propagated LLVM tends to load imm64 over mov32 @@ -826,6 +827,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, * 'mov %eax, imm32' instead. */ emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else if (is_simm32(imm64)) { + emit_mov_imm32(&prog, true, dst_reg, imm32_lo); } else { /* movabsq rax, imm64 */ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); @@ -1169,6 +1172,54 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } +static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + EMIT1(0xF0); /* lock prefix */ + switch (size) { + case BPF_W: + EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg)); + break; + case BPF_DW: + EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg)); + break; + default: + pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n"); + return -EFAULT; + } + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* lock *(u32/u64*)(dst_reg + idx_reg + off) <op>= src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off); + *pprog = prog; + return 0; +} + #define DONT_CLEAR 1 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) @@ -1351,8 +1402,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image break; case BPF_ALU64 | BPF_MOV | BPF_X: - if (insn->off == BPF_ADDR_SPACE_CAST && - 
insn->imm == 1U << 16) { + if (insn_is_cast_user(insn)) { if (dst_reg != src_reg) /* 32-bit mov */ emit_mov_reg(&prog, false, dst_reg, src_reg); @@ -1383,6 +1433,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image maybe_emit_mod(&prog, AUX_REG, dst_reg, true); EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg)); break; + } else if (insn_is_mov_percpu_addr(insn)) { + /* mov <dst>, <src> (if necessary) */ + EMIT_mov(dst_reg, src_reg); +#ifdef CONFIG_SMP + /* add <dst>, gs:[<off>] */ + EMIT2(0x65, add_1mod(0x48, dst_reg)); + EMIT3(0x03, add_2reg(0x04, 0, dst_reg), 0x25); + EMIT((u32)(unsigned long)&this_cpu_off, 4); +#endif + break; } fallthrough; case BPF_ALU | BPF_MOV | BPF_X: @@ -1963,6 +2023,15 @@ populate_extable: return err; break; + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: + start_of_ldx = prog; + err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code), + dst_reg, src_reg, X86_REG_R12, insn->off); + if (err) + return err; + goto populate_extable; + /* call */ case BPF_JMP | BPF_CALL: { u8 *ip = image + addrs[i - 1]; @@ -2994,12 +3063,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) bpf_prog_pack_free(image, size); } -void arch_protect_bpf_trampoline(void *image, unsigned int size) -{ -} - -void arch_unprotect_bpf_trampoline(void *image, unsigned int size) +int arch_protect_bpf_trampoline(void *image, unsigned int size) { + return 0; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, @@ -3359,6 +3425,11 @@ bool bpf_jit_supports_subprog_tailcalls(void) return true; } +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { @@ -3462,6 +3533,21 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) + return false; + } + return true; +} + bool bpf_jit_supports_ptr_xchg(void) { return true; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index c10083a8e68e..de0f9e5f9f73 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2600,8 +2600,7 @@ out_image: if (bpf_jit_enable > 1) bpf_jit_dump(prog->len, proglen, pass + 1, image); - if (image) { - bpf_jit_binary_lock_ro(header); + if (image && !bpf_jit_binary_lock_ro(header)) { prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; |
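
In the arch/x86/kernel/irq.c hunk, handle_pending_pir() first snapshots the 256-bit PIR with four plain 64-bit reads and only then clears the non-zero words with xchg, and sysvec_posted_msi_notification() keeps calling it to coalesce bursts. The stand-alone user-space sketch below models that ordering; drain_pir(), fake_xchg() and the printf() dispatch are illustrative stand-ins, not kernel interfaces.

/*
 * User-space model of the "copy the PIR, then clear only the non-zero words"
 * scheme described in the irq.c comment.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PIR_WORDS 4			/* 4 x 64 bits = 256 posted-interrupt bits */

/* Stand-in for arch_xchg(): atomically swap in val, return the old value. */
static uint64_t fake_xchg(uint64_t *p, uint64_t val)
{
	return __atomic_exchange_n(p, val, __ATOMIC_SEQ_CST);
}

static bool drain_pir(uint64_t *pir)
{
	uint64_t pir_copy[PIR_WORDS];
	bool handled = false;
	int i;

	/* Pass 1: plain 64-bit reads, no atomics, to snapshot the PIR. */
	for (i = 0; i < PIR_WORDS; i++)
		pir_copy[i] = pir[i];

	/* Pass 2: only words that had bits set pay for an atomic xchg. */
	for (i = 0; i < PIR_WORDS; i++) {
		if (!pir_copy[i])
			continue;
		pir_copy[i] = fake_xchg(&pir[i], 0);
		handled = true;
	}

	/* Dispatch from the private copy, not from the contended PIR. */
	if (handled) {
		for (i = 0; i < PIR_WORDS * 64; i++) {
			if (pir_copy[i / 64] & (1ULL << (i % 64)))
				printf("dispatch vector %d\n", i);
		}
	}

	return handled;
}

int main(void)
{
	uint64_t pir[PIR_WORDS] = { 0, 1ULL << 3, 0, 0 };	/* vector 67 pending */

	/* Like the notification handler: keep draining while work shows up. */
	while (drain_pir(pir))
		;
	return 0;
}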
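The fixup_irqs() hunk replaces an open-coded read of the local APIC IRR with is_vector_pending(). The sketch below only models what the removed lines computed; it is an assumption (not shown in this hunk) that is_vector_pending() additionally covers interrupts parked in the posted-MSI PIR when CONFIG_X86_POSTED_MSI is enabled. fake_apic_read() and irr_vector_pending() are invented names for illustration.

/* Model of the removed IRR test: the IRR is eight 32-bit registers spaced
 * 0x10 apart, so vector N lives in register N/32 at bit N%32.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define APIC_IRR 0x200	/* offset of the first IRR register */

/* Stand-in for apic_read(); pretend only vector 0x41 is pending. */
static uint32_t fake_apic_read(uint32_t reg)
{
	return reg == APIC_IRR + (0x41 / 32) * 0x10 ? 1u << (0x41 % 32) : 0;
}

static bool irr_vector_pending(unsigned int vector)
{
	uint32_t irr = fake_apic_read(APIC_IRR + (vector / 32) * 0x10);

	return irr & (1u << (vector % 32));
}

int main(void)
{
	printf("vector 0x41 pending: %d\n", irr_vector_pending(0x41));
	printf("vector 0x42 pending: %d\n", irr_vector_pending(0x42));
	return 0;
}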
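Several KVM hunks stop touching the posted-interrupt descriptor's SN bit-field directly (pi_desc.sn = 1, WARN(pi_desc->sn, ...)) and use the shared helpers from asm/posted_intr.h instead (__pi_set_sn(), pi_test_sn()). The model below, based on the layout removed from arch/x86/kvm/vmx/posted_intr.h, shows why bit 1 of the 64-bit control word is the same storage as the sn bit-field; pi_desc_model and the two helpers are illustrative only (the kernel helpers use set_bit()/test_bit(), and the real struct carries a 64-byte alignment attribute omitted here). Bit-field layout is assumed to be the usual little-endian x86 one.

/*
 * The PIR occupies descriptor bits 0-255, so within the following 64-bit
 * control word ON is bit 0 (descriptor bit 256) and SN is bit 1 (bit 257).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define POSTED_INTR_ON	0	/* Outstanding Notification */
#define POSTED_INTR_SN	1	/* Suppress Notification */

struct pi_desc_model {
	uint32_t pir[8];		/* posted interrupt requested, 256 bits */
	union {
		struct {
			uint16_t on:1, sn:1, rsvd_1:14;
			uint8_t  nv;	/* notification vector */
			uint8_t  rsvd_2;
			uint32_t ndst;	/* notification destination */
		};
		uint64_t control;
	};
	uint32_t rsvd[6];
};

static void model_pi_set_sn(struct pi_desc_model *pid)
{
	pid->control |= 1ULL << POSTED_INTR_SN;
}

static bool model_pi_test_sn(const struct pi_desc_model *pid)
{
	return pid->control & (1ULL << POSTED_INTR_SN);
}

int main(void)
{
	struct pi_desc_model pid = { 0 };

	model_pi_set_sn(&pid);
	printf("sn bit-field = %u, SN test via control word = %d\n",
	       pid.sn, model_pi_test_sn(&pid));
	return 0;
}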
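The emit_mov_imm64() change in arch/x86/net/bpf_jit_comp.c adds a middle case: a 64-bit immediate that is not a valid zero-extended 32-bit value but is a valid sign-extended one can be loaded with a sign-extending 32-bit mov instead of a 10-byte movabs. A small sketch of that three-way classification follows; fits_uimm32()/fits_simm32() are local stand-ins for is_uimm32()/is_simm32(), and the instruction forms in the strings are the usual x86-64 encodings, not emitted here.

/* Classify a 64-bit immediate the way emit_mov_imm64() does. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool fits_uimm32(uint64_t v)	/* usable as a zero-extended imm32 */
{
	return v == (uint32_t)v;
}

static bool fits_simm32(uint64_t v)	/* usable as a sign-extended imm32 */
{
	return v == (uint64_t)(int64_t)(int32_t)v;
}

static const char *pick_mov(uint64_t imm64)
{
	if (fits_uimm32(imm64))
		return "mov r32, imm32      (zero-extends into the full register)";
	if (fits_simm32(imm64))
		return "mov r/m64, imm32    (REX.W + C7, sign-extends)";
	return "movabs r64, imm64   (10-byte full immediate)";
}

int main(void)
{
	const uint64_t samples[] = {
		0x12345678,		/* fits as unsigned 32-bit */
		0xffffffffffff0000ull,	/* only fits as sign-extended 32-bit */
		0x123456789abcdef0ull,	/* needs the full 64-bit form */
	};

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#018llx -> %s\n",
		       (unsigned long long)samples[i], pick_mov(samples[i]));
	return 0;
}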