104 files changed, 2572 insertions, 2346 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1b1a0e95c751..c03cd0d765d3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -75,6 +75,7 @@ config ARM64
 	select CLONE_BACKWARDS
 	select COMMON_CLK
 	select CPU_PM if (SUSPEND || CPU_IDLE)
+	select CRC32
 	select DCACHE_WORD_ACCESS
 	select DMA_DIRECT_OPS
 	select EDAC_SUPPORT
@@ -104,6 +105,7 @@ config ARM64
 	select HAVE_ARCH_BITREVERSE
 	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
@@ -142,6 +144,7 @@ config ARM64
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RCU_TABLE_FREE
+	select HAVE_RCU_TABLE_INVALIDATE
 	select HAVE_RSEQ
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
@@ -479,6 +482,19 @@ config ARM64_ERRATUM_1024718
 
 	  If unsure, say Y.
 
+config ARM64_ERRATUM_1188873
+	bool "Cortex-A76: MRC read following MRRC read of specific Generic Timer in AArch32 might give incorrect result"
+	default y
+	select ARM_ARCH_TIMER_OOL_WORKAROUND
+	help
+	  This option adds work arounds for ARM Cortex-A76 erratum 1188873
+
+	  Affected Cortex-A76 cores (r0p0, r1p0, r2p0) could cause
+	  register corruption when accessing the timer registers from
+	  AArch32 userspace.
+
+	  If unsure, say Y.
+
 config CAVIUM_ERRATUM_22375
 	bool "Cavium erratum 22375, 24313"
 	default y
@@ -769,9 +785,6 @@ source kernel/Kconfig.hz
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	def_bool y
 
-config ARCH_HAS_HOLES_MEMORYMODEL
-	def_bool y if SPARSEMEM
-
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
 	select SPARSEMEM_VMEMMAP_ENABLE
@@ -786,7 +799,7 @@ config ARCH_FLATMEM_ENABLE
 	def_bool !NUMA
 
 config HAVE_ARCH_PFN_VALID
-	def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
+	def_bool y
 
 config HW_PERF_EVENTS
 	def_bool y
@@ -1132,6 +1145,20 @@ config ARM64_RAS_EXTN
 	  and access the new registers if the system supports the extension.
 	  Platform RAS features may additionally depend on firmware support.
 
+config ARM64_CNP
+	bool "Enable support for Common Not Private (CNP) translations"
+	default y
+	depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN
+	help
+	  Common Not Private (CNP) allows translation table entries to
+	  be shared between different PEs in the same inner shareable
+	  domain, so the hardware can use this fact to optimise the
+	  caching of such entries in the TLB.
+
+	  Selecting this option allows the CNP feature to be detected
+	  at runtime, and does not affect PEs that do not implement
+	  this feature.
+
 endmenu
 
 config ARM64_SVE
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 393d2b524284..5a89a957641b 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -128,6 +128,7 @@ config ARCH_MVEBU
 	select MVEBU_ICU
 	select MVEBU_ODMI
 	select MVEBU_PIC
+	select MVEBU_SEI
 	select OF_GPIO
 	select PINCTRL
 	select PINCTRL_ARMADA_37XX
diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
index d033da401c26..fb3d2ee77c56 100644
--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
+++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
@@ -137,6 +137,8 @@
 			reset-names = "stmmaceth", "stmmaceth-ocp";
 			clocks = <&clkmgr STRATIX10_EMAC0_CLK>;
 			clock-names = "stmmaceth";
+			tx-fifo-depth = <16384>;
+			rx-fifo-depth = <16384>;
 			status = "disabled";
 		};
 
@@ -150,6 +152,8 @@
 			reset-names = "stmmaceth", "stmmaceth-ocp";
 			clocks = <&clkmgr STRATIX10_EMAC1_CLK>;
 			clock-names = "stmmaceth";
+			tx-fifo-depth = <16384>;
+			rx-fifo-depth = <16384>;
 			status = "disabled";
 		};
 
@@ -163,6 +167,8 @@
 			reset-names = "stmmaceth", "stmmaceth-ocp";
 			clocks = <&clkmgr STRATIX10_EMAC2_CLK>;
 			clock-names = "stmmaceth";
+			tx-fifo-depth = <16384>;
+			rx-fifo-depth = <16384>;
 			status = "disabled";
 		};
 
@@ -467,16 +473,51 @@
 			status = "disabled";
 		};
 
+		sdr: sdr@f8011100 {
+			compatible = "altr,sdr-ctl", "syscon";
+			reg = <0xf8011100 0xc0>;
+		};
+
 		eccmgr {
-			compatible = "altr,socfpga-s10-ecc-manager";
+			compatible = "altr,socfpga-a10-ecc-manager";
+			altr,sysmgr-syscon = <&sysmgr>;
+			#address-cells = <1>;
+			#size-cells = <1>;
 			interrupts = <0 15 4>, <0 95 4>;
 			interrupt-controller;
 			#interrupt-cells = <2>;
+			ranges;
 
 			sdramedac {
 				compatible = "altr,sdram-edac-s10";
+				altr,sdr-syscon = <&sdr>;
 				interrupts = <16 4>, <48 4>;
 			};
+
+			usb0-ecc@ff8c4000 {
+				compatible = "altr,socfpga-usb-ecc";
+				reg = <0xff8c4000 0x100>;
+				altr,ecc-parent = <&usb0>;
+				interrupts = <2 4>,
+					     <34 4>;
+			};
+
+			emac0-rx-ecc@ff8c0000 {
+				compatible = "altr,socfpga-eth-mac-ecc";
+				reg = <0xff8c0000 0x100>;
+				altr,ecc-parent = <&gmac0>;
+				interrupts = <4 4>,
+					     <36 4>;
+			};
+
+			emac0-tx-ecc@ff8c0400 {
+				compatible = "altr,socfpga-eth-mac-ecc";
+				reg = <0xff8c0400 0x100>;
+				altr,ecc-parent = <&gmac0>;
+				interrupts = <5 4>,
+					     <37 4>;
+			};
+
 		};
 
 		qspi: spi@ff8d2000 {
diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts
index 6edc4fa9fd42..7c661753bfaf 100644
--- a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts
+++ b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts
@@ -76,7 +76,7 @@
 	phy-mode = "rgmii";
 	phy-handle = <&phy0>;
 
-	max-frame-size = <3800>;
+	max-frame-size = <9000>;
 
 	mdio0 {
 		#address-cells = <1>;
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index db8d364f8476..3d165b4cdd2a 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -698,6 +698,7 @@ CONFIG_MEMTEST=y
 CONFIG_SECURITY=y
 CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_ANSI_CPRNG=y
+CONFIG_CRYPTO_DEV_FSL_DPAA2_CAAM=y
 CONFIG_ARM64_CRYPTO=y
 CONFIG_CRYPTO_SHA1_ARM64_CE=y
 CONFIG_CRYPTO_SHA2_ARM64_CE=y
@@ -706,7 +707,6 @@ CONFIG_CRYPTO_SHA3_ARM64=m
 CONFIG_CRYPTO_SM3_ARM64_CE=m
 CONFIG_CRYPTO_GHASH_ARM64_CE=y
 CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
-CONFIG_CRYPTO_CRC32_ARM64_CE=m
 CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
 CONFIG_CRYPTO_CHACHA20_NEON=m
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index e3fdb0fd6f70..a5606823ed4d 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -66,11 +66,6 @@ config CRYPTO_CRCT10DIF_ARM64_CE
 	depends on KERNEL_MODE_NEON && CRC_T10DIF
 	select CRYPTO_HASH
 
-config CRYPTO_CRC32_ARM64_CE
-	tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
-	depends on CRC32
-	select CRYPTO_HASH
-
 config CRYPTO_AES_ARM64
 	tristate "AES core cipher using scalar instructions"
 	select CRYPTO_AES
@@ -119,10 +114,4 @@ config CRYPTO_AES_ARM64_BS
 	select CRYPTO_AES_ARM64
 	select CRYPTO_SIMD
 
-config CRYPTO_SPECK_NEON
-	tristate "NEON accelerated Speck cipher algorithms"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_BLKCIPHER
-	select CRYPTO_SPECK
-
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index bcafd016618e..f476fede09ba 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -32,9 +32,6 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
 crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
-obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
-crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
 
@@ -56,9 +53,6 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
 chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
 
-obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
-speck-neon-y := speck-neon-core.o speck-neon-glue.o
-
 obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
 aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
 
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 623e74ed1c67..143070510809 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -17,6 +17,11 @@
 
 	.arch		armv8-a+crypto
 
+	xtsmask		.req	v16
+
+	.macro		xts_reload_mask, tmp
+	.endm
+
 	/* preload all round keys */
 	.macro		load_round_keys, rounds, rk
 	cmp		\rounds, #12
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index adcb83eb683c..1e676625ef33 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -15,6 +15,7 @@
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
 #include <linux/module.h>
 #include <linux/cpufeature.h>
 #include <crypto/xts.h>
@@ -31,6 +32,8 @@
 #define aes_ecb_decrypt		ce_aes_ecb_decrypt
 #define aes_cbc_encrypt		ce_aes_cbc_encrypt
 #define aes_cbc_decrypt		ce_aes_cbc_decrypt
+#define aes_cbc_cts_encrypt	ce_aes_cbc_cts_encrypt
+#define aes_cbc_cts_decrypt	ce_aes_cbc_cts_decrypt
 #define aes_ctr_encrypt		ce_aes_ctr_encrypt
 #define aes_xts_encrypt		ce_aes_xts_encrypt
 #define aes_xts_decrypt		ce_aes_xts_decrypt
@@ -45,6 +48,8 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
 #define aes_ecb_decrypt		neon_aes_ecb_decrypt
 #define aes_cbc_encrypt		neon_aes_cbc_encrypt
 #define aes_cbc_decrypt		neon_aes_cbc_decrypt
+#define aes_cbc_cts_encrypt	neon_aes_cbc_cts_encrypt
+#define aes_cbc_cts_decrypt	neon_aes_cbc_cts_decrypt
 #define aes_ctr_encrypt		neon_aes_ctr_encrypt
 #define aes_xts_encrypt		neon_aes_xts_encrypt
 #define aes_xts_decrypt		neon_aes_xts_decrypt
@@ -63,30 +68,41 @@ MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 
 /* defined in aes-modes.S */
-asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
 				int rounds, int blocks);
-asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
 				int rounds, int blocks);
 
-asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
 				int rounds, int blocks, u8 iv[]);
-asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
 				int rounds, int blocks, u8 iv[]);
 
-asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+				int rounds, int bytes, u8 const iv[]);
+asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+				int rounds, int bytes, u8 const iv[]);
+
+asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
 				int rounds, int blocks, u8 ctr[]);
 
-asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
-				int rounds, int blocks, u8 const rk2[], u8 iv[],
+asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+				int rounds, int blocks, u32 const rk2[], u8 iv[],
 				int first);
-asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
-				int rounds, int blocks, u8 const rk2[], u8 iv[],
+asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+				int rounds, int blocks, u32 const rk2[], u8 iv[],
 				int first);
 
 asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
 			       int blocks, u8 dg[], int enc_before,
 			       int enc_after);
 
+struct cts_cbc_req_ctx {
+	struct scatterlist sg_src[2];
+	struct scatterlist sg_dst[2];
+	struct skcipher_request subreq;
+};
+
 struct crypto_aes_xts_ctx {
 	struct crypto_aes_ctx key1;
 	struct crypto_aes_ctx __aligned(8) key2;
@@ -142,7 +158,7 @@ static int ecb_encrypt(struct skcipher_request *req)
 	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
 		kernel_neon_begin();
 		aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				(u8 *)ctx->key_enc, rounds, blocks);
+				ctx->key_enc, rounds, blocks);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
@@ -162,7 +178,7 @@ static int ecb_decrypt(struct skcipher_request *req)
 	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
 		kernel_neon_begin();
 		aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				(u8 *)ctx->key_dec, rounds, blocks);
+				ctx->key_dec, rounds, blocks);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
@@ -182,7 +198,7 @@ static int cbc_encrypt(struct skcipher_request *req)
 	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
 		kernel_neon_begin();
 		aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
+				ctx->key_enc, rounds, blocks, walk.iv);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
@@ -202,13 +218,149 @@ static int cbc_decrypt(struct skcipher_request *req)
 	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
 		kernel_neon_begin();
 		aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				(u8 *)ctx->key_dec, rounds, blocks, walk.iv);
+				ctx->key_dec, rounds, blocks, walk.iv);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
 	return err;
 }
 
+static int cts_cbc_init_tfm(struct crypto_skcipher *tfm)
+{
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct cts_cbc_req_ctx));
+	return 0;
+}
+
+static int cts_cbc_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct cts_cbc_req_ctx *rctx = skcipher_request_ctx(req);
+	int err, rounds = 6 + ctx->key_length / 4;
+	int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+	struct scatterlist *src = req->src, *dst = req->dst;
+	struct skcipher_walk walk;
+
+	skcipher_request_set_tfm(&rctx->subreq, tfm);
+
+	if (req->cryptlen <= AES_BLOCK_SIZE) {
+		if (req->cryptlen < AES_BLOCK_SIZE)
+			return -EINVAL;
+		cbc_blocks = 1;
+	}
+
+	if (cbc_blocks > 0) {
+		unsigned int blocks;
+
+		skcipher_request_set_crypt(&rctx->subreq, req->src, req->dst,
+					   cbc_blocks * AES_BLOCK_SIZE,
+					   req->iv);
+
+		err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+
+		while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+			kernel_neon_begin();
+			aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+					ctx->key_enc, rounds, blocks, walk.iv);
+			kernel_neon_end();
+			err = skcipher_walk_done(&walk,
+						 walk.nbytes % AES_BLOCK_SIZE);
+		}
+		if (err)
+			return err;
+
+		if (req->cryptlen == AES_BLOCK_SIZE)
+			return 0;
+
+		dst = src = scatterwalk_ffwd(rctx->sg_src, req->src,
+					     rctx->subreq.cryptlen);
+		if (req->dst != req->src)
+			dst = scatterwalk_ffwd(rctx->sg_dst, req->dst,
+					       rctx->subreq.cryptlen);
+	}
+
+	/* handle ciphertext stealing */
+	skcipher_request_set_crypt(&rctx->subreq, src, dst,
+				   req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+				   req->iv);
+
+	err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+	if (err)
+		return err;
+
+	kernel_neon_begin();
+	aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+			    ctx->key_enc, rounds, walk.nbytes, walk.iv);
+	kernel_neon_end();
+
+	return skcipher_walk_done(&walk, 0);
+}
+
+static int cts_cbc_decrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct cts_cbc_req_ctx *rctx = skcipher_request_ctx(req);
+	int err, rounds = 6 + ctx->key_length / 4;
+	int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+	struct scatterlist *src = req->src, *dst = req->dst;
+	struct skcipher_walk walk;
+
+	skcipher_request_set_tfm(&rctx->subreq, tfm);
+
+	if (req->cryptlen <= AES_BLOCK_SIZE) {
+		if (req->cryptlen < AES_BLOCK_SIZE)
+			return -EINVAL;
+		cbc_blocks = 1;
+	}
+
+	if (cbc_blocks > 0) {
+		unsigned int blocks;
+
+		skcipher_request_set_crypt(&rctx->subreq, req->src, req->dst,
+					   cbc_blocks * AES_BLOCK_SIZE,
+					   req->iv);
+
+		err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+
+		while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+			kernel_neon_begin();
+			aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+					ctx->key_dec, rounds, blocks, walk.iv);
+			kernel_neon_end();
+			err = skcipher_walk_done(&walk,
+						 walk.nbytes % AES_BLOCK_SIZE);
+		}
+		if (err)
+			return err;
+
+		if (req->cryptlen == AES_BLOCK_SIZE)
+			return 0;
+
+		dst = src = scatterwalk_ffwd(rctx->sg_src, req->src,
+					     rctx->subreq.cryptlen);
+		if (req->dst != req->src)
+			dst = scatterwalk_ffwd(rctx->sg_dst, req->dst,
+					       rctx->subreq.cryptlen);
+	}
+
+	/* handle ciphertext stealing */
+	skcipher_request_set_crypt(&rctx->subreq, src, dst,
+				   req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+				   req->iv);
+
+	err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+	if (err)
+		return err;
+
+	kernel_neon_begin();
+	aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+			    ctx->key_dec, rounds, walk.nbytes, walk.iv);
+	kernel_neon_end();
+
+	return skcipher_walk_done(&walk, 0);
+}
+
 static int ctr_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -222,7 +374,7 @@ static int ctr_encrypt(struct skcipher_request *req)
 	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
 		kernel_neon_begin();
 		aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
+				ctx->key_enc, rounds, blocks, walk.iv);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
@@ -238,7 +390,7 @@ static int ctr_encrypt(struct skcipher_request *req)
 		blocks = -1;
 
 		kernel_neon_begin();
-		aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
+		aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds,
 				blocks, walk.iv);
 		kernel_neon_end();
 		crypto_xor_cpy(tdst, tsrc, tail, nbytes);
@@ -272,8 +424,8 @@ static int xts_encrypt(struct skcipher_request *req)
 	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
 		kernel_neon_begin();
 		aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				(u8 *)ctx->key1.key_enc, rounds, blocks,
-				(u8 *)ctx->key2.key_enc, walk.iv, first);
+				ctx->key1.key_enc, rounds, blocks,
+				ctx->key2.key_enc, walk.iv, first);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
@@ -294,8 +446,8 @@ static int xts_decrypt(struct skcipher_request *req)
 	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
 		kernel_neon_begin();
 		aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				(u8 *)ctx->key1.key_dec, rounds, blocks,
-				(u8 *)ctx->key2.key_enc, walk.iv, first);
+				ctx->key1.key_dec, rounds, blocks,
+				ctx->key2.key_enc, walk.iv, first);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
@@ -336,6 +488,24 @@ static struct skcipher_alg aes_algs[] = { {
 	.decrypt	= cbc_decrypt,
 }, {
 	.base = {
+		.cra_name		= "__cts(cbc(aes))",
+		.cra_driver_name	= "__cts-cbc-aes-" MODE,
+		.cra_priority		= PRIO,
+		.cra_flags		= CRYPTO_ALG_INTERNAL,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+		.cra_module		= THIS_MODULE,
+	},
+	.min_keysize	= AES_MIN_KEY_SIZE,
+	.max_keysize	= AES_MAX_KEY_SIZE,
+	.ivsize		= AES_BLOCK_SIZE,
+	.walksize	= 2 * AES_BLOCK_SIZE,
+	.setkey		= skcipher_aes_setkey,
+	.encrypt	= cts_cbc_encrypt,
+	.decrypt	= cts_cbc_decrypt,
+	.init		= cts_cbc_init_tfm,
+}, {
+	.base = {
 		.cra_name		= "__ctr(aes)",
 		.cra_driver_name	= "__ctr-aes-" MODE,
 		.cra_priority		= PRIO,
@@ -412,7 +582,6 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
 {
 	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
 	be128 *consts = (be128 *)ctx->consts;
-	u8 *rk = (u8 *)ctx->key.key_enc;
 	int rounds = 6 + key_len / 4;
 	int err;
 
@@ -422,7 +591,8 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
 
 	/* encrypt the zero vector */
 	kernel_neon_begin();
-	aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1);
+	aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc,
+			rounds, 1);
 	kernel_neon_end();
 
 	cmac_gf128_mul_by_x(consts, consts);
@@ -441,7 +611,6 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
 	};
 
 	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
-	u8 *rk = (u8 *)ctx->key.key_enc;
 	int rounds = 6 + key_len / 4;
 	u8 key[AES_BLOCK_SIZE];
 	int err;
@@ -451,8 +620,8 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
 		return err;
 
 	kernel_neon_begin();
-	aes_ecb_encrypt(key, ks[0], rk, rounds, 1);
-	aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2);
+	aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1);
+	aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2);
 	kernel_neon_end();
 
 	return cbcmac_setkey(tfm, key, sizeof(key));
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 483a7130cf0e..67700045a0e0 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -14,12 +14,12 @@
 	.align		4
 
 aes_encrypt_block4x:
-	encrypt_block4x	v0, v1, v2, v3, w22, x21, x8, w7
+	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
 ENDPROC(aes_encrypt_block4x)
 
 aes_decrypt_block4x:
-	decrypt_block4x	v0, v1, v2, v3, w22, x21, x8, w7
+	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
 ENDPROC(aes_decrypt_block4x)
 
@@ -31,71 +31,57 @@ ENDPROC(aes_decrypt_block4x)
 	 */
 
 AES_ENTRY(aes_ecb_encrypt)
-	frame_push	5
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-
-.Lecbencrestart:
-	enc_prepare	w22, x21, x5
+	enc_prepare	w3, x2, x5
 
 .LecbencloopNx:
-	subs		w23, w23, #4
+	subs		w4, w4, #4
 	bmi		.Lecbenc1x
-	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
 	bl		aes_encrypt_block4x
-	st1		{v0.16b-v3.16b}, [x19], #64
-	cond_yield_neon	.Lecbencrestart
+	st1		{v0.16b-v3.16b}, [x0], #64
 	b		.LecbencloopNx
 .Lecbenc1x:
-	adds		w23, w23, #4
+	adds		w4, w4, #4
 	beq		.Lecbencout
 .Lecbencloop:
-	ld1		{v0.16b}, [x20], #16		/* get next pt block */
-	encrypt_block	v0, w22, x21, x5, w6
-	st1		{v0.16b}, [x19], #16
-	subs		w23, w23, #1
+	ld1		{v0.16b}, [x1], #16		/* get next pt block */
+	encrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
 	bne		.Lecbencloop
 .Lecbencout:
-	frame_pop
+	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_ecb_encrypt)
 
 
 AES_ENTRY(aes_ecb_decrypt)
-	frame_push	5
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-
-.Lecbdecrestart:
-	dec_prepare	w22, x21, x5
+	dec_prepare	w3, x2, x5
 
 .LecbdecloopNx:
-	subs		w23, w23, #4
+	subs		w4, w4, #4
 	bmi		.Lecbdec1x
-	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
 	bl		aes_decrypt_block4x
-	st1		{v0.16b-v3.16b}, [x19], #64
-	cond_yield_neon	.Lecbdecrestart
+	st1		{v0.16b-v3.16b}, [x0], #64
 	b		.LecbdecloopNx
 .Lecbdec1x:
-	adds		w23, w23, #4
+	adds		w4, w4, #4
 	beq		.Lecbdecout
 .Lecbdecloop:
-	ld1		{v0.16b}, [x20], #16		/* get next ct block */
-	decrypt_block	v0, w22, x21, x5, w6
-	st1		{v0.16b}, [x19], #16
-	subs		w23, w23, #1
+	ld1		{v0.16b}, [x1], #16		/* get next ct block */
+	decrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
 	bne		.Lecbdecloop
 .Lecbdecout:
-	frame_pop
+	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_ecb_decrypt)
 
@@ -108,162 +94,211 @@ AES_ENDPROC(aes_ecb_decrypt)
 	 */
 
 AES_ENTRY(aes_cbc_encrypt)
-	frame_push	6
-
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
-
-.Lcbcencrestart:
-	ld1		{v4.16b}, [x24]			/* get iv */
-	enc_prepare	w22, x21, x6
+	ld1		{v4.16b}, [x5]			/* get iv */
+	enc_prepare	w3, x2, x6
 
 .Lcbcencloop4x:
-	subs		w23, w23, #4
+	subs		w4, w4, #4
 	bmi		.Lcbcenc1x
-	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
 	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
-	encrypt_block	v0, w22, x21, x6, w7
+	encrypt_block	v0, w3, x2, x6, w7
 	eor		v1.16b, v1.16b, v0.16b
-	encrypt_block	v1, w22, x21, x6, w7
+	encrypt_block	v1, w3, x2, x6, w7
 	eor		v2.16b, v2.16b, v1.16b
-	encrypt_block	v2, w22, x21, x6, w7
+	encrypt_block	v2, w3, x2, x6, w7
 	eor		v3.16b, v3.16b, v2.16b
-	encrypt_block	v3, w22, x21, x6, w7
-	st1		{v0.16b-v3.16b}, [x19], #64
+	encrypt_block	v3, w3, x2, x6, w7
+	st1		{v0.16b-v3.16b}, [x0], #64
 	mov		v4.16b, v3.16b
-	st1		{v4.16b}, [x24]			/* return iv */
-	cond_yield_neon	.Lcbcencrestart
 	b		.Lcbcencloop4x
 .Lcbcenc1x:
-	adds		w23, w23, #4
+	adds		w4, w4, #4
 	beq		.Lcbcencout
 .Lcbcencloop:
-	ld1		{v0.16b}, [x20], #16		/* get next pt block */
+	ld1		{v0.16b}, [x1], #16		/* get next pt block */
 	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
-	encrypt_block	v4, w22, x21, x6, w7
-	st1		{v4.16b}, [x19], #16
-	subs		w23, w23, #1
+	encrypt_block	v4, w3, x2, x6, w7
+	st1		{v4.16b}, [x0], #16
+	subs		w4, w4, #1
 	bne		.Lcbcencloop
 .Lcbcencout:
-	st1		{v4.16b}, [x24]			/* return iv */
-	frame_pop
+	st1		{v4.16b}, [x5]			/* return iv */
 	ret
 AES_ENDPROC(aes_cbc_encrypt)
 
 
 AES_ENTRY(aes_cbc_decrypt)
-	frame_push	6
-
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
 
-.Lcbcdecrestart:
-	ld1		{v7.16b}, [x24]			/* get iv */
-	dec_prepare	w22, x21, x6
+	ld1		{v7.16b}, [x5]			/* get iv */
+	dec_prepare	w3, x2, x6
 
 .LcbcdecloopNx:
-	subs		w23, w23, #4
+	subs		w4, w4, #4
 	bmi		.Lcbcdec1x
-	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
 	mov		v4.16b, v0.16b
 	mov		v5.16b, v1.16b
 	mov		v6.16b, v2.16b
 	bl		aes_decrypt_block4x
-	sub		x20, x20, #16
+	sub		x1, x1, #16
 	eor		v0.16b, v0.16b, v7.16b
 	eor		v1.16b, v1.16b, v4.16b
-	ld1		{v7.16b}, [x20], #16		/* reload 1 ct block */
+	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
 	eor		v2.16b, v2.16b, v5.16b
 	eor		v3.16b, v3.16b, v6.16b
-	st1		{v0.16b-v3.16b}, [x19], #64
-	st1		{v7.16b}, [x24]			/* return iv */
-	cond_yield_neon	.Lcbcdecrestart
+	st1		{v0.16b-v3.16b}, [x0], #64
 	b		.LcbcdecloopNx
 .Lcbcdec1x:
-	adds		w23, w23, #4
+	adds		w4, w4, #4
 	beq		.Lcbcdecout
 .Lcbcdecloop:
-	ld1		{v1.16b}, [x20], #16		/* get next ct block */
+	ld1		{v1.16b}, [x1], #16		/* get next ct block */
 	mov		v0.16b, v1.16b			/* ...and copy to v0 */
-	decrypt_block	v0, w22, x21, x6, w7
+	decrypt_block	v0, w3, x2, x6, w7
 	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
 	mov		v7.16b, v1.16b			/* ct is next iv */
-	st1		{v0.16b}, [x19], #16
-	subs		w23, w23, #1
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
 	bne		.Lcbcdecloop
 .Lcbcdecout:
-	st1		{v7.16b}, [x24]			/* return iv */
-	frame_pop
+	st1		{v7.16b}, [x5]			/* return iv */
+	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_cbc_decrypt)
 
 
 	/*
+	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+	 *		       int rounds, int bytes, u8 const iv[])
+	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+	 *		       int rounds, int bytes, u8 const iv[])
+	 */
+
+AES_ENTRY(aes_cbc_cts_encrypt)
+	adr_l		x8, .Lcts_permute_table
+	sub		x4, x4, #16
+	add		x9, x8, #32
+	add		x8, x8, x4
+	sub		x9, x9, x4
+	ld1		{v3.16b}, [x8]
+	ld1		{v4.16b}, [x9]
+
+	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
+	ld1		{v1.16b}, [x1]
+
+	ld1		{v5.16b}, [x5]			/* get iv */
+	enc_prepare	w3, x2, x6
+
+	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
+	tbl		v1.16b, {v1.16b}, v4.16b
+	encrypt_block	v0, w3, x2, x6, w7
+
+	eor		v1.16b, v1.16b, v0.16b
+	tbl		v0.16b, {v0.16b}, v3.16b
+	encrypt_block	v1, w3, x2, x6, w7
+
+	add		x4, x0, x4
+	st1		{v0.16b}, [x4]			/* overlapping stores */
+	st1		{v1.16b}, [x0]
+	ret
+AES_ENDPROC(aes_cbc_cts_encrypt)
+
+AES_ENTRY(aes_cbc_cts_decrypt)
+	adr_l		x8, .Lcts_permute_table
+	sub		x4, x4, #16
+	add		x9, x8, #32
+	add		x8, x8, x4
+	sub		x9, x9, x4
+	ld1		{v3.16b}, [x8]
+	ld1		{v4.16b}, [x9]
+
+	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
+	ld1		{v1.16b}, [x1]
+
+	ld1		{v5.16b}, [x5]			/* get iv */
+	dec_prepare	w3, x2, x6
+
+	tbl		v2.16b, {v1.16b}, v4.16b
+	decrypt_block	v0, w3, x2, x6, w7
+	eor		v2.16b, v2.16b, v0.16b
+
+	tbx		v0.16b, {v1.16b}, v4.16b
+	tbl		v2.16b, {v2.16b}, v3.16b
+	decrypt_block	v0, w3, x2, x6, w7
+	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
+
+	add		x4, x0, x4
+	st1		{v2.16b}, [x4]			/* overlapping stores */
+	st1		{v0.16b}, [x0]
+	ret
+AES_ENDPROC(aes_cbc_cts_decrypt)
+
+	.section	".rodata", "a"
+	.align		6
+.Lcts_permute_table:
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.previous
+
+
+	/*
 	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 	 *		   int blocks, u8 ctr[])
 	 */
 
 AES_ENTRY(aes_ctr_encrypt)
-	frame_push	6
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
-
-.Lctrrestart:
-	enc_prepare	w22, x21, x6
-	ld1		{v4.16b}, [x24]
+	enc_prepare	w3, x2, x6
+	ld1		{v4.16b}, [x5]
 
 	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
 	rev		x6, x6
+	cmn		w6, w4			/* 32 bit overflow? */
+	bcs		.Lctrloop
 .LctrloopNx:
-	subs		w23, w23, #4
+	subs		w4, w4, #4
 	bmi		.Lctr1x
-	cmn		w6, #4			/* 32 bit overflow? */
-	bcs		.Lctr1x
-	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
-	dup		v7.4s, w6
+	add		w7, w6, #1
 	mov		v0.16b, v4.16b
-	add		v7.4s, v7.4s, v8.4s
+	add		w8, w6, #2
 	mov		v1.16b, v4.16b
-	rev32		v8.16b, v7.16b
+	add		w9, w6, #3
 	mov		v2.16b, v4.16b
+	rev		w7, w7
 	mov		v3.16b, v4.16b
-	mov		v1.s[3], v8.s[0]
-	mov		v2.s[3], v8.s[1]
-	mov		v3.s[3], v8.s[2]
-	ld1		{v5.16b-v7.16b}, [x20], #48	/* get 3 input blocks */
+	rev		w8, w8
+	mov		v1.s[3], w7
+	rev		w9, w9
+	mov		v2.s[3], w8
+	mov		v3.s[3], w9
+	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
 	bl		aes_encrypt_block4x
 	eor		v0.16b, v5.16b, v0.16b
-	ld1		{v5.16b}, [x20], #16		/* get 1 input block  */
+	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
 	eor		v1.16b, v6.16b, v1.16b
 	eor		v2.16b, v7.16b, v2.16b
 	eor		v3.16b, v5.16b, v3.16b
-	st1		{v0.16b-v3.16b}, [x19], #64
+	st1		{v0.16b-v3.16b}, [x0], #64
 	add		x6, x6, #4
 	rev		x7, x6
 	ins		v4.d[1], x7
-	cbz		w23, .Lctrout
-	st1		{v4.16b}, [x24]		/* return next CTR value */
-	cond_yield_neon	.Lctrrestart
+	cbz		w4, .Lctrout
 	b		.LctrloopNx
 .Lctr1x:
-	adds		w23, w23, #4
+	adds		w4, w4, #4
 	beq		.Lctrout
 .Lctrloop:
 	mov		v0.16b, v4.16b
-	encrypt_block	v0, w22, x21, x8, w7
+	encrypt_block	v0, w3, x2, x8, w7
 
 	adds		x6, x6, #1		/* increment BE ctr */
 	rev		x7, x6
@@ -271,22 +306,22 @@ AES_ENTRY(aes_ctr_encrypt)
 	bcs		.Lctrcarry		/* overflow? */
 
 .Lctrcarrydone:
-	subs		w23, w23, #1
+	subs		w4, w4, #1
 	bmi		.Lctrtailblock		/* blocks <0 means tail block */
-	ld1		{v3.16b}, [x20], #16
+	ld1		{v3.16b}, [x1], #16
 	eor		v3.16b, v0.16b, v3.16b
-	st1		{v3.16b}, [x19], #16
+	st1		{v3.16b}, [x0], #16
 	bne		.Lctrloop
 
 .Lctrout:
-	st1		{v4.16b}, [x24]		/* return next CTR value */
-.Lctrret:
-	frame_pop
+	st1		{v4.16b}, [x5]		/* return next CTR value */
+	ldp		x29, x30, [sp], #16
 	ret
 
 .Lctrtailblock:
-	st1		{v0.16b}, [x19]
-	b		.Lctrret
+	st1		{v0.16b}, [x0]
+	ldp		x29, x30, [sp], #16
+	ret
 
 .Lctrcarry:
 	umov		x7, v4.d[0]		/* load upper word of ctr  */
@@ -296,7 +331,6 @@ AES_ENTRY(aes_ctr_encrypt)
 	ins		v4.d[0], x7
 	b		.Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)
-	.ltorg
 
 
 	/*
@@ -306,150 +340,132 @@ AES_ENDPROC(aes_ctr_encrypt)
 	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
 	 */
 
-	.macro		next_tweak, out, in, const, tmp
+	.macro		next_tweak, out, in, tmp
 	sshr		\tmp\().2d,  \in\().2d,   #63
-	and		\tmp\().16b, \tmp\().16b, \const\().16b
+	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
 	add		\out\().2d,  \in\().2d,   \in\().2d
 	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
 	eor		\out\().16b, \out\().16b, \tmp\().16b
 	.endm
 
-.Lxts_mul_x:
-CPU_LE(	.quad		1, 0x87		)
-CPU_BE(	.quad		0x87, 1		)
+	.macro		xts_load_mask, tmp
+	movi		xtsmask.2s, #0x1
+	movi		\tmp\().2s, #0x87
+	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
+	.endm
 
 AES_ENTRY(aes_xts_encrypt)
-	frame_push	6
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x6
-
-	ld1		{v4.16b}, [x24]
+	ld1		{v4.16b}, [x6]
+	xts_load_mask	v8
 	cbz		w7, .Lxtsencnotfirst
 
 	enc_prepare	w3, x5, x8
 	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
 	enc_switch_key	w3, x2, x8
-	ldr		q7, .Lxts_mul_x
 	b		.LxtsencNx
 
-.Lxtsencrestart:
-	ld1		{v4.16b}, [x24]
 .Lxtsencnotfirst:
-	enc_prepare	w22, x21, x8
+	enc_prepare	w3, x2, x8
 .LxtsencloopNx:
-	ldr		q7, .Lxts_mul_x
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 .LxtsencNx:
-	subs		w23, w23, #4
+	subs		w4, w4, #4
 	bmi		.Lxtsenc1x
-	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
-	next_tweak	v5, v4, v7, v8
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+	next_tweak	v5, v4, v8
 	eor		v0.16b, v0.16b, v4.16b
-	next_tweak	v6, v5, v7, v8
+	next_tweak	v6, v5, v8
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	next_tweak	v7, v6, v7, v8
+	next_tweak	v7, v6, v8
 	eor		v3.16b, v3.16b, v7.16b
 	bl		aes_encrypt_block4x
 	eor		v3.16b, v3.16b, v7.16b
 	eor		v0.16b, v0.16b, v4.16b
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	st1		{v0.16b-v3.16b}, [x19], #64
+	st1		{v0.16b-v3.16b}, [x0], #64
 	mov		v4.16b, v7.16b
-	cbz		w23, .Lxtsencout
-	st1		{v4.16b}, [x24]
-	cond_yield_neon	.Lxtsencrestart
+	cbz		w4, .Lxtsencout
+	xts_reload_mask	v8
 	b		.LxtsencloopNx
 .Lxtsenc1x:
-	adds		w23, w23, #4
+	adds		w4, w4, #4
 	beq		.Lxtsencout
 .Lxtsencloop:
-	ld1		{v1.16b}, [x20], #16
+	ld1		{v1.16b}, [x1], #16
 	eor		v0.16b, v1.16b, v4.16b
-	encrypt_block	v0, w22, x21, x8, w7
+	encrypt_block	v0, w3, x2, x8, w7
 	eor		v0.16b, v0.16b, v4.16b
-	st1		{v0.16b}, [x19], #16
-	subs		w23, w23, #1
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
 	beq		.Lxtsencout
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 	b		.Lxtsencloop
 .Lxtsencout:
-	st1		{v4.16b}, [x24]
-	frame_pop
+	st1		{v4.16b}, [x6]
+	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_xts_encrypt)
 
 
 AES_ENTRY(aes_xts_decrypt)
-	frame_push	6
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x6
-
-	ld1		{v4.16b}, [x24]
+	ld1		{v4.16b}, [x6]
+	xts_load_mask	v8
 	cbz		w7, .Lxtsdecnotfirst
 
 	enc_prepare	w3, x5, x8
 	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
 	dec_prepare	w3, x2, x8
-	ldr		q7, .Lxts_mul_x
 	b		.LxtsdecNx
 
-.Lxtsdecrestart:
-	ld1		{v4.16b}, [x24]
 .Lxtsdecnotfirst:
-	dec_prepare	w22, x21, x8
+	dec_prepare	w3, x2, x8
 .LxtsdecloopNx:
-	ldr		q7, .Lxts_mul_x
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 .LxtsdecNx:
-	subs		w23, w23, #4
+	subs		w4, w4, #4
 	bmi		.Lxtsdec1x
-	ld1		{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
-	next_tweak	v5, v4, v7, v8
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	next_tweak	v5, v4, v8
 	eor		v0.16b, v0.16b, v4.16b
-	next_tweak	v6, v5, v7, v8
+	next_tweak	v6, v5, v8
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	next_tweak	v7, v6, v7, v8
+	next_tweak	v7, v6, v8
 	eor		v3.16b, v3.16b, v7.16b
 	bl		aes_decrypt_block4x
 	eor		v3.16b, v3.16b, v7.16b
 	eor		v0.16b, v0.16b, v4.16b
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	st1		{v0.16b-v3.16b}, [x19], #64
+	st1		{v0.16b-v3.16b}, [x0], #64
 	mov		v4.16b, v7.16b
-	cbz		w23, .Lxtsdecout
-	st1		{v4.16b}, [x24]
-	cond_yield_neon	.Lxtsdecrestart
+	cbz		w4, .Lxtsdecout
+	xts_reload_mask	v8
 	b		.LxtsdecloopNx
 .Lxtsdec1x:
-	adds		w23, w23, #4
+	adds		w4, w4, #4
 	beq		.Lxtsdecout
 .Lxtsdecloop:
-	ld1		{v1.16b}, [x20], #16
+	ld1		{v1.16b}, [x1], #16
 	eor		v0.16b, v1.16b, v4.16b
-	decrypt_block	v0, w22, x21, x8, w7
+	decrypt_block	v0, w3, x2, x8, w7
 	eor		v0.16b, v0.16b, v4.16b
-	st1		{v0.16b}, [x19], #16
-	subs		w23, w23, #1
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
 	beq		.Lxtsdecout
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 	b		.Lxtsdecloop
 .Lxtsdecout:
-	st1		{v4.16b}, [x24]
-	frame_pop
+	st1		{v4.16b}, [x6]
+	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_xts_decrypt)
 
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 1c7b45b7268e..29100f692e8a 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -14,6 +14,12 @@
 #define AES_ENTRY(func)		ENTRY(neon_ ## func)
 #define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
 
+	xtsmask		.req	v7
+
+	.macro		xts_reload_mask, tmp
+	xts_load_mask	\tmp
+	.endm
+
 	/* multiply by polynomial 'x' in GF(2^8) */
 	.macro		mul_by_x, out, in, temp, const
 	sshr		\temp, \in, #7
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
deleted file mode 100644
index 8061bf0f9c66..000000000000
--- a/arch/arm64/crypto/crc32-ce-core.S
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please  visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-	.section	".rodata", "a"
-	.align		6
-	.cpu		generic+crypto+crc
-
-.Lcrc32_constants:
-	/*
-	 * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
-	 * #define CONSTANT_R1  0x154442bd4LL
-	 *
-	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
-	 * #define CONSTANT_R2  0x1c6e41596LL
-	 */
-	.octa		0x00000001c6e415960000000154442bd4
-
-	/*
-	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
-	 * #define CONSTANT_R3  0x1751997d0LL
-	 *
-	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
-	 * #define CONSTANT_R4  0x0ccaa009eLL
-	 */
-	.octa		0x00000000ccaa009e00000001751997d0
-
-	/*
-	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
-	 * #define CONSTANT_R5  0x163cd6124LL
-	 */
-	.quad		0x0000000163cd6124
-	.quad		0x00000000FFFFFFFF
-
-	/*
-	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
-	 *
-	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
-	 *                                                      = 0x1F7011641LL
-	 * #define CONSTANT_RU  0x1F7011641LL
-	 */
-	.octa		0x00000001F701164100000001DB710641
-
-.Lcrc32c_constants:
-	.octa		0x000000009e4addf800000000740eef02
-	.octa		0x000000014cd00bd600000000f20c0dfe
-	.quad		0x00000000dd45aab8
-	.quad		0x00000000FFFFFFFF
-	.octa		0x00000000dea713f10000000105ec76f0
-
-	vCONSTANT	.req	v0
-	dCONSTANT	.req	d0
-	qCONSTANT	.req	q0
-
-	BUF		.req	x19
-	LEN		.req	x20
-	CRC		.req	x21
-	CONST		.req	x22
-
-	vzr		.req	v9
-
-	/**
-	 * Calculate crc32
-	 * BUF - buffer
-	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
-	 * CRC - initial crc32
-	 * return %eax crc32
-	 * uint crc32_pmull_le(unsigned char const *buffer,
-	 *                     size_t len, uint crc32)
-	 */
-	.text
-ENTRY(crc32_pmull_le)
-	adr_l		x3, .Lcrc32_constants
-	b		0f
-
-ENTRY(crc32c_pmull_le)
-	adr_l		x3, .Lcrc32c_constants
-
-0:	frame_push	4, 64
-
-	mov		BUF, x0
-	mov		LEN, x1
-	mov		CRC, x2
-	mov		CONST, x3
-
-	bic		LEN, LEN, #15
-	ld1		{v1.16b-v4.16b}, [BUF], #0x40
-	movi		vzr.16b, #0
-	fmov		dCONSTANT, CRC
-	eor		v1.16b, v1.16b, vCONSTANT.16b
-	sub		LEN, LEN, #0x40
-	cmp		LEN, #0x40
-	b.lt		less_64
-
-	ldr		qCONSTANT, [CONST]
-
-loop_64:		/* 64 bytes Full cache line folding */
-	sub		LEN, LEN, #0x40
-
-	pmull2		v5.1q, v1.2d, vCONSTANT.2d
-	pmull2		v6.1q, v2.2d, vCONSTANT.2d
-	pmull2		v7.1q, v3.2d, vCONSTANT.2d
-	pmull2		v8.1q, v4.2d, vCONSTANT.2d
-
-	pmull		v1.1q, v1.1d, vCONSTANT.1d
-	pmull		v2.1q, v2.1d, vCONSTANT.1d
-	pmull		v3.1q, v3.1d, vCONSTANT.1d
-	pmull		v4.1q, v4.1d, vCONSTANT.1d
-
-	eor		v1.16b, v1.16b, v5.16b
-	ld1		{v5.16b}, [BUF], #0x10
-	eor		v2.16b, v2.16b, v6.16b
-	ld1		{v6.16b}, [BUF], #0x10
-	eor		v3.16b, v3.16b, v7.16b
-	ld1		{v7.16b}, [BUF], #0x10
-	eor		v4.16b, v4.16b, v8.16b
-	ld1		{v8.16b}, [BUF], #0x10
-
-	eor		v1.16b, v1.16b, v5.16b
-	eor		v2.16b, v2.16b, v6.16b
-	eor		v3.16b, v3.16b, v7.16b
-	eor		v4.16b, v4.16b, v8.16b
-
-	cmp		LEN, #0x40
-	b.lt		less_64
-
-	if_will_cond_yield_neon
-	stp		q1, q2, [sp, #.Lframe_local_offset]
-	stp		q3, q4, [sp, #.Lframe_local_offset + 32]
-	do_cond_yield_neon
-	ldp		q1, q2, [sp, #.Lframe_local_offset]
-	ldp		q3, q4, [sp, #.Lframe_local_offset + 32]
-	ldr		qCONSTANT, [CONST]
-	movi		vzr.16b, #0
-	endif_yield_neon
-	b		loop_64
-
-less_64:		/* Folding cache line into 128bit */
-	ldr		qCONSTANT, [CONST, #16]
-
-	pmull2		v5.1q, v1.2d, vCONSTANT.2d
-	pmull		v1.1q, v1.1d, vCONSTANT.1d
-	eor		v1.16b, v1.16b, v5.16b
-	eor		v1.16b, v1.16b, v2.16b
-
-	pmull2		v5.1q, v1.2d, vCONSTANT.2d
-	pmull		v1.1q, v1.1d, vCONSTANT.1d
-	eor		v1.16b, v1.16b, v5.16b
-	eor		v1.16b, v1.16b, v3.16b
-
-	pmull2		v5.1q, v1.2d, vCONSTANT.2d
-	pmull		v1.1q, v1.1d, vCONSTANT.1d
-	eor		v1.16b, v1.16b, v5.16b
-	eor		v1.16b, v1.16b, v4.16b
-
-	cbz		LEN, fold_64
-
-loop_16:		/* Folding rest buffer into 128bit */
-	subs		LEN, LEN, #0x10
-
-	ld1		{v2.16b}, [BUF], #0x10
-	pmull2		v5.1q, v1.2d, vCONSTANT.2d
-	pmull		v1.1q, v1.1d, vCONSTANT.1d
-	eor		v1.16b, v1.16b, v5.16b
-	eor		v1.16b, v1.16b, v2.16b
-
-	b.ne		loop_16
-
-fold_64:
-	/* perform the last 64 bit fold, also adds 32 zeroes
-	 * to the input stream */
-	ext		v2.16b, v1.16b, v1.16b, #8
-	pmull2		v2.1q, v2.2d, vCONSTANT.2d
-	ext		v1.16b, v1.16b, vzr.16b, #8
-	eor		v1.16b, v1.16b, v2.16b
-
-	/* final 32-bit fold */
-	ldr		dCONSTANT, [CONST, #32]
-	ldr		d3, [CONST, #40]
-
-	ext		v2.16b, v1.16b, vzr.16b, #4
-	and		v1.16b, v1.16b, v3.16b
-	pmull		v1.1q, v1.1d, vCONSTANT.1d
-	eor		v1.16b, v1.16b, v2.16b
-
-	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
-	ldr		qCONSTANT, [CONST, #48]
-
-	and		v2.16b, v1.16b, v3.16b
-	ext		v2.16b, vzr.16b, v2.16b, #8
-	pmull2		v2.1q, v2.2d, vCONSTANT.2d
-	and		v2.16b, v2.16b, v3.16b
-	pmull		v2.1q, v2.1d, vCONSTANT.1d
-	eor		v1.16b, v1.16b, v2.16b
-	mov		w0, v1.s[1]
-
-	frame_pop
-	ret
-ENDPROC(crc32_pmull_le)
-ENDPROC(crc32c_pmull_le)
-
-	.macro		__crc32, c
-0:	subs		x2, x2, #16
-	b.mi		8f
-	ldp		x3, x4, [x1], #16
-CPU_BE(	rev		x3, x3		)
-CPU_BE(	rev		x4, x4		)
-	crc32\c\()x	w0, w0, x3
-	crc32\c\()x	w0, w0, x4
-	b.ne		0b
-	ret
-
-8:	tbz		x2, #3, 4f
-	ldr		x3, [x1], #8
-CPU_BE(	rev		x3, x3		)
-	crc32\c\()x	w0, w0, x3
-4:	tbz		x2, #2, 2f
-	ldr		w3, [x1], #4
-CPU_BE(	rev		w3, w3		)
-	crc32\c\()w	w0, w0, w3
-2:	tbz		x2, #1, 1f
-	ldrh		w3, [x1], #2
-CPU_BE(	rev16		w3, w3		)
-	crc32\c\()h	w0, w0, w3
-1:	tbz		x2, #0, 0f
-	ldrb		w3, [x1]
-	crc32\c\()b	w0, w0, w3
-0:	ret
-	.endm
-
-	.align		5
-ENTRY(crc32_armv8_le)
-	__crc32
-ENDPROC(crc32_armv8_le)
-
-	.align		5
-ENTRY(crc32c_armv8_le)
-	__crc32		c
-ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
deleted file mode 100644
index 34b4e3d46aab..000000000000
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <asm/unaligned.h>
-
-#define PMULL_MIN_LEN		64L	/* minimum size of buffer
-					 * for crc32_pmull_le_16 */
-#define SCALE_F			16L	/* size of NEON register */
-
-asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
-asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
-
-asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
-asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
-
-static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
-static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
-
-static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
-{
-	u32 *key = crypto_tfm_ctx(tfm);
-
-	*key = 0;
-	return 0;
-}
-
-static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
-{
-	u32 *key = crypto_tfm_ctx(tfm);
-
-	*key = ~0;
-	return 0;
-}
-
-static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
-			      unsigned int keylen)
-{
-	u32 *mctx = crypto_shash_ctx(hash);
-
-	if (keylen != sizeof(u32)) {
-		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	*mctx = le32_to_cpup((__le32 *)key);
-	return 0;
-}
-
-static int crc32_pmull_init(struct shash_desc *desc)
-{
-	u32 *mctx = crypto_shash_ctx(desc->tfm);
-	u32 *crc = shash_desc_ctx(desc);
-
-	*crc = *mctx;
-	return 0;
-}
-
-static int crc32_update(struct shash_desc *desc, const u8 *data,
-			unsigned int length)
-{
-	u32 *crc = shash_desc_ctx(desc);
-
-	*crc = crc32_armv8_le(*crc, data, length);
-	return 0;
-}
-
-static int crc32c_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int length)
-{
-	u32 *crc = shash_desc_ctx(desc);
-
-	*crc = crc32c_armv8_le(*crc, data, length);
-	return 0;
-}
-
-static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int length)
-{
-	u32 *crc = shash_desc_ctx(desc);
-	unsigned int l;
-
-	if ((u64)data % SCALE_F) {
-		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
-
-		*crc = fallback_crc32(*crc, data, l);
-
-		data += l;
-		length -= l;
-	}
-
-	if (length >= PMULL_MIN_LEN && may_use_simd()) {
-		l = round_down(length, SCALE_F);
-
-		kernel_neon_begin();
-		*crc = crc32_pmull_le(data, l, *crc);
-		kernel_neon_end();
-
-		data += l;
-		length -= l;
-	}
-
-	if (length > 0)
-		*crc = fallback_crc32(*crc, data, length);
-
-	return 0;
-}
-
-static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int length)
-{
-	u32 *crc = shash_desc_ctx(desc);
-	unsigned int l;
-
-	if ((u64)data % SCALE_F) {
-		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
-
-		*crc = fallback_crc32c(*crc, data, l);
-
-		data += l;
-		length -= l;
-	}
-
-	if (length >= PMULL_MIN_LEN && may_use_simd()) {
-		l = round_down(length, SCALE_F);
-
-		kernel_neon_begin();
-		*crc = crc32c_pmull_le(data, l, *crc);
-		kernel_neon_end();
-
-		data += l;
-		length -= l;
-	}
-
-	if (length > 0) {
-		*crc = fallback_crc32c(*crc, data, length);
-	}
-
-	return 0;
-}
-
-static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
-{
-	u32 *crc = shash_desc_ctx(desc);
-
-	put_unaligned_le32(*crc, out);
-	return 0;
-}
-
-static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
-{
-	u32 *crc = shash_desc_ctx(desc);
-
-	put_unaligned_le32(~*crc, out);
-	return 0;
-}
-
-static struct shash_alg crc32_pmull_algs[] = { {
-	.setkey			= crc32_pmull_setkey,
-	.init			= crc32_pmull_init,
-	.update			= crc32_update,
-	.final			= crc32_pmull_final,
-	.descsize		= sizeof(u32),
-	.digestsize		= sizeof(u32),
-
-	.base.cra_ctxsize	= sizeof(u32),
-	.base.cra_init		= crc32_pmull_cra_init,
-	.base.cra_name		= "crc32",
-	.base.cra_driver_name	= "crc32-arm64-ce",
-	.base.cra_priority	= 200,
-	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
-	.base.cra_blocksize	= 1,
-	.base.cra_module	= THIS_MODULE,
-}, {
-	.setkey			= crc32_pmull_setkey,
-	.init			= crc32_pmull_init,
-	.update			= crc32c_update,
-	.final			= crc32c_pmull_final,
-	.descsize		= sizeof(u32),
-	.digestsize		= sizeof(u32),
-
-	.base.cra_ctxsize	= sizeof(u32),
-	.base.cra_init		= crc32c_pmull_cra_init,
-	.base.cra_name		= "crc32c",
-	.base.cra_driver_name	= "crc32c-arm64-ce",
-	.base.cra_priority	= 200,
-	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
-	.base.cra_blocksize	= 1,
-	.base.cra_module	= THIS_MODULE,
-} };
-
-static int __init crc32_pmull_mod_init(void)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
-		crc32_pmull_algs[0].update = crc32_pmull_update;
-		crc32_pmull_algs[1].update = crc32c_pmull_update;
-
-		if (elf_hwcap & HWCAP_CRC32) {
-			fallback_crc32 = crc32_armv8_le;
-			fallback_crc32c = crc32c_armv8_le;
-		} else {
-			fallback_crc32 = crc32_le;
-			fallback_crc32c = __crc32c_le;
-		}
-	} else if (!(elf_hwcap & HWCAP_CRC32)) {
-		return -ENODEV;
-	}
-	return crypto_register_shashes(crc32_pmull_algs,
-				       ARRAY_SIZE(crc32_pmull_algs));
-}
-
-static void __exit crc32_pmull_mod_exit(void)
-{
-	crypto_unregister_shashes(crc32_pmull_algs,
-				  ARRAY_SIZE(crc32_pmull_algs));
-}
-
-static const struct cpu_feature crc32_cpu_feature[] = {
-	{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
-};
-MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
-
-module_init(crc32_pmull_mod_init);
-module_exit(crc32_pmull_mod_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 663ea71cdb38..9e82e8e8ed05 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -80,7 +80,186 @@
 
 	vzr		.req	v13
 
-ENTRY(crc_t10dif_pmull)
+	ad		.req	v14
+	bd		.req	v10
+
+	k00_16		.req	v15
+	k32_48		.req	v16
+
+	t3		.req	v17
+	t4		.req	v18
+	t5		.req	v19
+	t6		.req	v20
+	t7		.req	v21
+	t8		.req	v22
+	t9		.req	v23
+
+	perm1		.req	v24
+	perm2		.req	v25
+	perm3		.req	v26
+	perm4		.req	v27
+
+	bd1		.req	v28
+	bd2		.req	v29
+	bd3		.req	v30
+	bd4		.req	v31
+
+	.macro		__pmull_init_p64
+	.endm
+
+	.macro		__pmull_pre_p64, bd
+	.endm
+
+	.macro		__pmull_init_p8
+	// k00_16 := 0x0000000000000000_000000000000ffff
+	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
+	movi		k32_48.2d, #0xffffffff
+	mov		k32_48.h[2], k32_48.h[0]
+	ushr		k00_16.2d, k32_48.2d, #32
+
+	// prepare the permutation vectors
+	mov_q		x5, 0x080f0e0d0c0b0a09
+	movi		perm4.8b, #8
+	dup		perm1.2d, x5
+	eor		perm1.16b, perm1.16b, perm4.16b
+	ushr		perm2.2d, perm1.2d, #8
+	ushr		perm3.2d, perm1.2d, #16
+	ushr		perm4.2d, perm1.2d, #24
+	sli		perm2.2d, perm1.2d, #56
+	sli		perm3.2d, perm1.2d, #48
+	sli		perm4.2d, perm1.2d, #40
+	.endm
+
+	.macro		__pmull_pre_p8, bd
+	tbl		bd1.16b, {\bd\().16b}, perm1.16b
+	tbl		bd2.16b, {\bd\().16b}, perm2.16b
+	tbl		bd3.16b, {\bd\().16b}, perm3.16b
+	tbl		bd4.16b, {\bd\().16b}, perm4.16b
+	.endm
+
+__pmull_p8_core:
+.L__pmull_p8_core:
+	ext		t4.8b, ad.8b, ad.8b, #1			// A1
+	ext		t5.8b, ad.8b, ad.8b, #2			// A2
+	ext		t6.8b, ad.8b, ad.8b, #3			// A3
+
+	pmull		t4.8h, t4.8b, bd.8b			// F = A1*B
+	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
+	pmull		t5.8h, t5.8b, bd.8b			// H = A2*B
+	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
+	pmull		t6.8h, t6.8b, bd.8b			// J = A3*B
+	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
+	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
+	b		0f
+
+.L__pmull_p8_core2:
+	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
+	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
+	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
+
+	pmull2		t4.8h, t4.16b, bd.16b			// F = A1*B
+	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
+	pmull2		t5.8h, t5.16b, bd.16b			// H = A2*B
+	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
+	pmull2		t6.8h, t6.16b, bd.16b			// J = A3*B
+	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
+	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
+
+0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
+	eor		t5.16b, t5.16b, t7.16b			// M = G + H
+	eor		t6.16b, t6.16b, t9.16b			// N = I + J
+
+	uzp1		t8.2d, t4.2d, t5.2d
+	uzp2		t4.2d, t4.2d, t5.2d
+	uzp1		t7.2d, t6.2d, t3.2d
+	uzp2		t6.2d, t6.2d, t3.2d
+
+	// t4 = (L) (P0 + P1) << 8
+	// t5 = (M) (P2 + P3) << 16
+	eor		t8.16b, t8.16b, t4.16b
+	and		t4.16b, t4.16b, k32_48.16b
+
+	// t6 = (N) (P4 + P5) << 24
+	// t7 = (K) (P6 + P7) << 32
+	eor		t7.16b, t7.16b, t6.16b
+	and		t6.16b, t6.16b, k00_16.16b
+
+	eor		t8.16b, t8.16b, t4.16b
+	eor		t7.16b, t7.16b, t6.16b
+
+	zip2		t5.2d, t8.2d, t4.2d
+	zip1		t4.2d, t8.2d, t4.2d
+	zip2		t3.2d, t7.2d, t6.2d
+	zip1		t6.2d, t7.2d, t6.2d
+
+	ext		t4.16b, t4.16b, t4.16b, #15
+	ext		t5.16b, t5.16b, t5.16b, #14
+	ext		t6.16b, t6.16b, t6.16b, #13
+	ext		t3.16b, t3.16b, t3.16b, #12
+
+	eor		t4.16b, t4.16b, t5.16b
+	eor		t6.16b, t6.16b, t3.16b
+	ret
+ENDPROC(__pmull_p8_core)
+
+	.macro		__pmull_p8, rq, ad, bd, i
+	.ifnc		\bd, v10
+	.err
+	.endif
+	mov		ad.16b, \ad\().16b
+	.ifb		\i
+	pmull		\rq\().8h, \ad\().8b, bd.8b		// D = A*B
+	.else
+	pmull2		\rq\().8h, \ad\().16b, bd.16b		// D = A*B
+	.endif
+
+	bl		.L__pmull_p8_core\i
+
+	eor		\rq\().16b, \rq\().16b, t4.16b
+	eor		\rq\().16b, \rq\().16b, t6.16b
+	.endm
+
+	.macro		fold64, p, reg1, reg2
+	ldp		q11, q12, [arg2], #0x20
+
+	__pmull_\p	v8, \reg1, v10, 2
+	__pmull_\p	\reg1, \reg1, v10
+
+CPU_LE(	rev64		v11.16b, v11.16b		)
+CPU_LE(	rev64		v12.16b, v12.16b		)
+
+	__pmull_\p	v9, \reg2, v10, 2
+	__pmull_\p	\reg2, \reg2, v10
+
+CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
+CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
+
+	eor		\reg1\().16b, \reg1\().16b, v8.16b
+	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	eor		\reg1\().16b, \reg1\().16b, v11.16b
+	eor		\reg2\().16b, \reg2\().16b, v12.16b
+	.endm
+
+	.macro		fold16, p, reg, rk
+	__pmull_\p	v8, \reg, v10
+	__pmull_\p	\reg, \reg, v10, 2
+	.ifnb		\rk
+	ldr_l		q10, \rk, x8
+	__pmull_pre_\p	v10
+	.endif
+	eor		v7.16b, v7.16b, v8.16b
+	eor		v7.16b, v7.16b, \reg\().16b
+	.endm
+
+	.macro		__pmull_p64, rd, rn, rm, n
+	.ifb		\n
+	pmull		\rd\().1q, \rn\().1d, \rm\().1d
+	.else
+	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
+	.endif
+	.endm
+
+	.macro		crc_t10dif_pmull, p
 	frame_push	3, 128
 
 	mov		arg1_low32, w0
@@ -89,6 +268,8 @@ ENTRY(crc_t10dif_pmull)
 
 	movi		vzr.16b, #0		// init zero register
 
+	__pmull_init_\p
+
 	// adjust the 16-bit initial_crc value, scale it to 32 bits
 	lsl		arg1_low32, arg1_low32, #16
 
@@ -96,7 +277,7 @@ ENTRY(crc_t10dif_pmull)
 	cmp		arg3, #256
 
 	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		_less_than_128
+	b.lt		.L_less_than_128_\@
 
 	// load the initial crc value
 	// crc value does not need to be byte-reflected, but it needs
@@ -137,6 +318,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	ldr_l		q10, rk3, x8	// xmm10 has rk3 and rk4
 					// type of pmull instruction
 					// will determine which constant to use
+	__pmull_pre_\p	v10
 
 	//
 	// we subtract 256 instead of 128 to save one instruction from the loop
@@ -147,41 +329,19 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	// buffer. The _fold_64_B_loop will fold 64B at a time
 	// until we have 64+y Bytes of buffer
 
-
 	// fold 64B at a time. This section of the code folds 4 vector
 	// registers in parallel
-_fold_64_B_loop:
+.L_fold_64_B_loop_\@:
 
-	.macro		fold64, reg1, reg2
-	ldp		q11, q12, [arg2], #0x20
-
-	pmull2		v8.1q, \reg1\().2d, v10.2d
-	pmull		\reg1\().1q, \reg1\().1d, v10.1d
-
-CPU_LE(	rev64		v11.16b, v11.16b		)
-CPU_LE(	rev64		v12.16b, v12.16b		)
-
-	pmull2		v9.1q, \reg2\().2d, v10.2d
-	pmull		\reg2\().1q, \reg2\().1d, v10.1d
-
-CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
-CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
-
-	eor		\reg1\().16b, \reg1\().16b, v8.16b
-	eor		\reg2\().16b, \reg2\().16b, v9.16b
-	eor		\reg1\().16b, \reg1\().16b, v11.16b
-	eor		\reg2\().16b, \reg2\().16b, v12.16b
-	.endm
-
-	fold64		v0, v1
-	fold64		v2, v3
-	fold64		v4, v5
-	fold64		v6, v7
+	fold64		\p, v0, v1
+	fold64		\p, v2, v3
+	fold64		\p, v4, v5
+	fold64		\p, v6, v7
 
 	subs		arg3, arg3, #128
 
 	// check if there is another 64B in the buffer to be able to fold
-	b.lt		_fold_64_B_end
+	b.lt		.L_fold_64_B_end_\@
 
 	if_will_cond_yield_neon
 	stp		q0, q1, [sp, #.Lframe_local_offset]
@@ -195,11 +355,13 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
 	ldr_l		q10, rk3, x8
 	movi		vzr.16b, #0		// init zero register
+	__pmull_init_\p
+	__pmull_pre_\p	v10
 	endif_yield_neon
 
-	b		_fold_64_B_loop
+	b		.L_fold_64_B_loop_\@
 
-_fold_64_B_end:
+.L_fold_64_B_end_\@:
 	// at this point, the buffer pointer is pointing at the last y Bytes
 	// of the buffer the 64B of folded data is in 4 of the vector
 	// registers: v0, v1, v2, v3
@@ -208,38 +370,29 @@ _fold_64_B_end:
 	// constants
 
 	ldr_l		q10, rk9, x8
+	__pmull_pre_\p	v10
 
-	.macro		fold16, reg, rk
-	pmull		v8.1q, \reg\().1d, v10.1d
-	pmull2		\reg\().1q, \reg\().2d, v10.2d
-	.ifnb		\rk
-	ldr_l		q10, \rk, x8
-	.endif
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
-	.endm
-
-	fold16		v0, rk11
-	fold16		v1, rk13
-	fold16		v2, rk15
-	fold16		v3, rk17
-	fold16		v4, rk19
-	fold16		v5, rk1
-	fold16		v6
+	fold16		\p, v0, rk11
+	fold16		\p, v1, rk13
+	fold16		\p, v2, rk15
+	fold16		\p, v3, rk17
+	fold16		\p, v4, rk19
+	fold16		\p, v5, rk1
+	fold16		\p, v6
 
 	// instead of 64, we add 48 to the loop counter to save 1 instruction
 	// from the loop instead of a cmp instruction, we use the negative
 	// flag with the jl instruction
 	adds		arg3, arg3, #(128-16)
-	b.lt		_final_reduction_for_128
+	b.lt		.L_final_reduction_for_128_\@
 
 	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
 	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
 	// continue folding 16B at a time
 
-_16B_reduction_loop:
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+.L_16B_reduction_loop_\@:
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 
 	ldr		q0, [arg2], #16
@@ -251,22 +404,22 @@ CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 	// instead of a cmp instruction, we utilize the flags with the
 	// jge instruction equivalent of: cmp arg3, 16-16
 	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	// now we have 16+z bytes left to reduce, where 0<= z < 16.
 	// first, we reduce the data in the xmm7 register
 
-_final_reduction_for_128:
+.L_final_reduction_for_128_\@:
 	// check if any more data to fold. If not, compute the CRC of
 	// the final 128 bits
 	adds		arg3, arg3, #16
-	b.eq		_128_done
+	b.eq		.L_128_done_\@
 
 	// here we are getting data that is less than 16 bytes.
 	// since we know that there was data before the pointer, we can
 	// offset the input pointer before the actual point, to receive
 	// exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
+.L_get_last_two_regs_\@:
 	add		arg2, arg2, arg3
 	ldr		q1, [arg2, #-16]
 CPU_LE(	rev64		v1.16b, v1.16b			)
@@ -291,47 +444,48 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 	bsl		v0.16b, v2.16b, v1.16b
 
 	// fold 16 Bytes
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 	eor		v7.16b, v7.16b, v0.16b
 
-_128_done:
+.L_128_done_\@:
 	// compute crc of a 128-bit value
 	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
+	__pmull_pre_\p	v10
 
 	// 64b fold
 	ext		v0.16b, vzr.16b, v7.16b, #8
 	mov		v7.d[0], v7.d[1]
-	pmull		v7.1q, v7.1d, v10.1d
+	__pmull_\p	v7, v7, v10
 	eor		v7.16b, v7.16b, v0.16b
 
 	// 32b fold
 	ext		v0.16b, v7.16b, vzr.16b, #4
 	mov		v7.s[3], vzr.s[0]
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	eor		v7.16b, v7.16b, v0.16b
 
 	// barrett reduction
-_barrett:
 	ldr_l		q10, rk7, x8
+	__pmull_pre_\p	v10
 	mov		v0.d[0], v7.d[1]
 
-	pmull		v0.1q, v0.1d, v10.1d
+	__pmull_\p	v0, v0, v10
 	ext		v0.16b, vzr.16b, v0.16b, #12
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	ext		v0.16b, vzr.16b, v0.16b, #12
 	eor		v7.16b, v7.16b, v0.16b
 	mov		w0, v7.s[1]
 
-_cleanup:
+.L_cleanup_\@:
 	// scale the result back to 16 bits
 	lsr		x0, x0, #16
 	frame_pop
 	ret
 
-_less_than_128:
-	cbz		arg3, _cleanup
+.L_less_than_128_\@:
+	cbz		arg3, .L_cleanup_\@
 
 	movi		v0.16b, #0
 	mov		v0.s[3], arg1_low32	// get the initial crc value
@@ -342,20 +496,21 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
 
 	cmp		arg3, #16
-	b.eq		_128_done		// exactly 16 left
-	b.lt		_less_than_16_left
+	b.eq		.L_128_done_\@		// exactly 16 left
+	b.lt		.L_less_than_16_left_\@
 
 	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
+	__pmull_pre_\p	v10
 
 	// update the counter. subtract 32 instead of 16 to save one
 	// instruction from the loop
 	subs		arg3, arg3, #32
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	add		arg3, arg3, #16
-	b		_get_last_two_regs
+	b		.L_get_last_two_regs_\@
 
-_less_than_16_left:
+.L_less_than_16_left_\@:
 	// shl r9, 4
 	adr_l		x0, tbl_shf_table + 16
 	sub		x0, x0, arg3
@@ -363,8 +518,17 @@ _less_than_16_left:
 	movi		v9.16b, #0x80
 	eor		v0.16b, v0.16b, v9.16b
 	tbl		v7.16b, {v7.16b}, v0.16b
-	b		_128_done
-ENDPROC(crc_t10dif_pmull)
+	b		.L_128_done_\@
+	.endm
+
+ENTRY(crc_t10dif_pmull_p8)
+	crc_t10dif_pmull	p8
+ENDPROC(crc_t10dif_pmull_p8)
+
+	.align		5
+ENTRY(crc_t10dif_pmull_p64)
+	crc_t10dif_pmull	p64
+ENDPROC(crc_t10dif_pmull_p64)
 
 // precomputed constants
 // these constants are precomputed from the poly:
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 96f0cae4a022..b461d62023f2 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -22,7 +22,10 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);
+
+static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
 
 static int crct10dif_init(struct shash_desc *desc)
 {
@@ -85,6 +88,11 @@ static struct shash_alg crc_t10dif_alg = {
 
 static int __init crc_t10dif_mod_init(void)
 {
+	if (elf_hwcap & HWCAP_PMULL)
+		crc_t10dif_pmull = crc_t10dif_pmull_p64;
+	else
+		crc_t10dif_pmull = crc_t10dif_pmull_p8;
+
 	return crypto_register_shash(&crc_t10dif_alg);
 }
 
@@ -93,8 +101,10 @@ static void __exit crc_t10dif_mod_exit(void)
 	crypto_unregister_shash(&crc_t10dif_alg);
 }
 
-module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
+module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
 module_exit(crc_t10dif_mod_exit);
 
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");
diff --git a/arch/arm64/crypto/speck-neon-core.S b/arch/arm64/crypto/speck-neon-core.S
deleted file mode 100644
index b14463438b09..000000000000
--- a/arch/arm64/crypto/speck-neon-core.S
+++ /dev/null
@@ -1,352 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
-	.text
-
-	// arguments
-	ROUND_KEYS	.req	x0	// const {u64,u32} *round_keys
-	NROUNDS		.req	w1	// int nrounds
-	NROUNDS_X	.req	x1
-	DST		.req	x2	// void *dst
-	SRC		.req	x3	// const void *src
-	NBYTES		.req	w4	// unsigned int nbytes
-	TWEAK		.req	x5	// void *tweak
-
-	// registers which hold the data being encrypted/decrypted
-	// (underscores avoid a naming collision with ARM64 registers x0-x3)
-	X_0		.req	v0
-	Y_0		.req	v1
-	X_1		.req	v2
-	Y_1		.req	v3
-	X_2		.req	v4
-	Y_2		.req	v5
-	X_3		.req	v6
-	Y_3		.req	v7
-
-	// the round key, duplicated in all lanes
-	ROUND_KEY	.req	v8
-
-	// index vector for tbl-based 8-bit rotates
-	ROTATE_TABLE	.req	v9
-	ROTATE_TABLE_Q	.req	q9
-
-	// temporary registers
-	TMP0		.req	v10
-	TMP1		.req	v11
-	TMP2		.req	v12
-	TMP3		.req	v13
-
-	// multiplication table for updating XTS tweaks
-	GFMUL_TABLE	.req	v14
-	GFMUL_TABLE_Q	.req	q14
-
-	// next XTS tweak value(s)
-	TWEAKV_NEXT	.req	v15
-
-	// XTS tweaks for the blocks currently being encrypted/decrypted
-	TWEAKV0		.req	v16
-	TWEAKV1		.req	v17
-	TWEAKV2		.req	v18
-	TWEAKV3		.req	v19
-	TWEAKV4		.req	v20
-	TWEAKV5		.req	v21
-	TWEAKV6		.req	v22
-	TWEAKV7		.req	v23
-
-	.align		4
-.Lror64_8_table:
-	.octa		0x080f0e0d0c0b0a090007060504030201
-.Lror32_8_table:
-	.octa		0x0c0f0e0d080b0a090407060500030201
-.Lrol64_8_table:
-	.octa		0x0e0d0c0b0a09080f0605040302010007
-.Lrol32_8_table:
-	.octa		0x0e0d0c0f0a09080b0605040702010003
-.Lgf128mul_table:
-	.octa		0x00000000000000870000000000000001
-.Lgf64mul_table:
-	.octa		0x0000000000000000000000002d361b00
-
-/*
- * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
- *
- * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
- * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
- * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
- * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
- */
-.macro _speck_round_128bytes	n, lanes
-
-	// x = ror(x, 8)
-	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
-	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
-	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
-	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
-
-	// x += y
-	add		X_0.\lanes, X_0.\lanes, Y_0.\lanes
-	add		X_1.\lanes, X_1.\lanes, Y_1.\lanes
-	add		X_2.\lanes, X_2.\lanes, Y_2.\lanes
-	add		X_3.\lanes, X_3.\lanes, Y_3.\lanes
-
-	// x ^= k
-	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
-	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
-	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
-	eor		X_3.16b, X_3.16b, ROUND_KEY.16b
-
-	// y = rol(y, 3)
-	shl		TMP0.\lanes, Y_0.\lanes, #3
-	shl		TMP1.\lanes, Y_1.\lanes, #3
-	shl		TMP2.\lanes, Y_2.\lanes, #3
-	shl		TMP3.\lanes, Y_3.\lanes, #3
-	sri		TMP0.\lanes, Y_0.\lanes, #(\n - 3)
-	sri		TMP1.\lanes, Y_1.\lanes, #(\n - 3)
-	sri		TMP2.\lanes, Y_2.\lanes, #(\n - 3)
-	sri		TMP3.\lanes, Y_3.\lanes, #(\n - 3)
-
-	// y ^= x
-	eor		Y_0.16b, TMP0.16b, X_0.16b
-	eor		Y_1.16b, TMP1.16b, X_1.16b
-	eor		Y_2.16b, TMP2.16b, X_2.16b
-	eor		Y_3.16b, TMP3.16b, X_3.16b
-.endm
-
-/*
- * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
- *
- * This is the inverse of _speck_round_128bytes().
- */
-.macro _speck_unround_128bytes	n, lanes
-
-	// y ^= x
-	eor		TMP0.16b, Y_0.16b, X_0.16b
-	eor		TMP1.16b, Y_1.16b, X_1.16b
-	eor		TMP2.16b, Y_2.16b, X_2.16b
-	eor		TMP3.16b, Y_3.16b, X_3.16b
-
-	// y = ror(y, 3)
-	ushr		Y_0.\lanes, TMP0.\lanes, #3
-	ushr		Y_1.\lanes, TMP1.\lanes, #3
-	ushr		Y_2.\lanes, TMP2.\lanes, #3
-	ushr		Y_3.\lanes, TMP3.\lanes, #3
-	sli		Y_0.\lanes, TMP0.\lanes, #(\n - 3)
-	sli		Y_1.\lanes, TMP1.\lanes, #(\n - 3)
-	sli		Y_2.\lanes, TMP2.\lanes, #(\n - 3)
-	sli		Y_3.\lanes, TMP3.\lanes, #(\n - 3)
-
-	// x ^= k
-	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
-	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
-	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
-	eor		X_3.16b, X_3.16b, ROUND_KEY.16b
-
-	// x -= y
-	sub		X_0.\lanes, X_0.\lanes, Y_0.\lanes
-	sub		X_1.\lanes, X_1.\lanes, Y_1.\lanes
-	sub		X_2.\lanes, X_2.\lanes, Y_2.\lanes
-	sub		X_3.\lanes, X_3.\lanes, Y_3.\lanes
-
-	// x = rol(x, 8)
-	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
-	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
-	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
-	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
-.endm
-
-.macro _next_xts_tweak	next, cur, tmp, n
-.if \n == 64
-	/*
-	 * Calculate the next tweak by multiplying the current one by x,
-	 * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
-	 */
-	sshr		\tmp\().2d, \cur\().2d, #63
-	and		\tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
-	shl		\next\().2d, \cur\().2d, #1
-	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
-	eor		\next\().16b, \next\().16b, \tmp\().16b
-.else
-	/*
-	 * Calculate the next two tweaks by multiplying the current ones by x^2,
-	 * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
-	 */
-	ushr		\tmp\().2d, \cur\().2d, #62
-	shl		\next\().2d, \cur\().2d, #2
-	tbl		\tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
-	eor		\next\().16b, \next\().16b, \tmp\().16b
-.endif
-.endm
-
-/*
- * _speck_xts_crypt() - Speck-XTS encryption/decryption
- *
- * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
- * using Speck-XTS, specifically the variant with a block size of '2n' and round
- * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
- * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
- * nonzero multiple of 128.
- */
-.macro _speck_xts_crypt	n, lanes, decrypting
-
-	/*
-	 * If decrypting, modify the ROUND_KEYS parameter to point to the last
-	 * round key rather than the first, since for decryption the round keys
-	 * are used in reverse order.
-	 */
-.if \decrypting
-	mov		NROUNDS, NROUNDS	/* zero the high 32 bits */
-.if \n == 64
-	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
-	sub		ROUND_KEYS, ROUND_KEYS, #8
-.else
-	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
-	sub		ROUND_KEYS, ROUND_KEYS, #4
-.endif
-.endif
-
-	// Load the index vector for tbl-based 8-bit rotates
-.if \decrypting
-	ldr		ROTATE_TABLE_Q, .Lrol\n\()_8_table
-.else
-	ldr		ROTATE_TABLE_Q, .Lror\n\()_8_table
-.endif
-
-	// One-time XTS preparation
-.if \n == 64
-	// Load first tweak
-	ld1		{TWEAKV0.16b}, [TWEAK]
-
-	// Load GF(2^128) multiplication table
-	ldr		GFMUL_TABLE_Q, .Lgf128mul_table
-.else
-	// Load first tweak
-	ld1		{TWEAKV0.8b}, [TWEAK]
-
-	// Load GF(2^64) multiplication table
-	ldr		GFMUL_TABLE_Q, .Lgf64mul_table
-
-	// Calculate second tweak, packing it together with the first
-	ushr		TMP0.2d, TWEAKV0.2d, #63
-	shl		TMP1.2d, TWEAKV0.2d, #1
-	tbl		TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
-	eor		TMP0.8b, TMP0.8b, TMP1.8b
-	mov		TWEAKV0.d[1], TMP0.d[0]
-.endif
-
-.Lnext_128bytes_\@:
-
-	// Calculate XTS tweaks for next 128 bytes
-	_next_xts_tweak	TWEAKV1, TWEAKV0, TMP0, \n
-	_next_xts_tweak	TWEAKV2, TWEAKV1, TMP0, \n
-	_next_xts_tweak	TWEAKV3, TWEAKV2, TMP0, \n
-	_next_xts_tweak	TWEAKV4, TWEAKV3, TMP0, \n
-	_next_xts_tweak	TWEAKV5, TWEAKV4, TMP0, \n
-	_next_xts_tweak	TWEAKV6, TWEAKV5, TMP0, \n
-	_next_xts_tweak	TWEAKV7, TWEAKV6, TMP0, \n
-	_next_xts_tweak	TWEAKV_NEXT, TWEAKV7, TMP0, \n
-
-	// Load the next source blocks into {X,Y}[0-3]
-	ld1		{X_0.16b-Y_1.16b}, [SRC], #64
-	ld1		{X_2.16b-Y_3.16b}, [SRC], #64
-
-	// XOR the source blocks with their XTS tweaks
-	eor		TMP0.16b, X_0.16b, TWEAKV0.16b
-	eor		Y_0.16b,  Y_0.16b, TWEAKV1.16b
-	eor		TMP1.16b, X_1.16b, TWEAKV2.16b
-	eor		Y_1.16b,  Y_1.16b, TWEAKV3.16b
-	eor		TMP2.16b, X_2.16b, TWEAKV4.16b
-	eor		Y_2.16b,  Y_2.16b, TWEAKV5.16b
-	eor		TMP3.16b, X_3.16b, TWEAKV6.16b
-	eor		Y_3.16b,  Y_3.16b, TWEAKV7.16b
-
-	/*
-	 * De-interleave the 'x' and 'y' elements of each block, i.e. make it so
-	 * that the X[0-3] registers contain only the second halves of blocks,
-	 * and the Y[0-3] registers contain only the first halves of blocks.
-	 * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
-	 */
-	uzp2		X_0.\lanes, TMP0.\lanes, Y_0.\lanes
-	uzp1		Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
-	uzp2		X_1.\lanes, TMP1.\lanes, Y_1.\lanes
-	uzp1		Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
-	uzp2		X_2.\lanes, TMP2.\lanes, Y_2.\lanes
-	uzp1		Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
-	uzp2		X_3.\lanes, TMP3.\lanes, Y_3.\lanes
-	uzp1		Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
-
-	// Do the cipher rounds
-	mov		x6, ROUND_KEYS
-	mov		w7, NROUNDS
-.Lnext_round_\@:
-.if \decrypting
-	ld1r		{ROUND_KEY.\lanes}, [x6]
-	sub		x6, x6, #( \n / 8 )
-	_speck_unround_128bytes	\n, \lanes
-.else
-	ld1r		{ROUND_KEY.\lanes}, [x6], #( \n / 8 )
-	_speck_round_128bytes	\n, \lanes
-.endif
-	subs		w7, w7, #1
-	bne		.Lnext_round_\@
-
-	// Re-interleave the 'x' and 'y' elements of each block
-	zip1		TMP0.\lanes, Y_0.\lanes, X_0.\lanes
-	zip2		Y_0.\lanes,  Y_0.\lanes, X_0.\lanes
-	zip1		TMP1.\lanes, Y_1.\lanes, X_1.\lanes
-	zip2		Y_1.\lanes,  Y_1.\lanes, X_1.\lanes
-	zip1		TMP2.\lanes, Y_2.\lanes, X_2.\lanes
-	zip2		Y_2.\lanes,  Y_2.\lanes, X_2.\lanes
-	zip1		TMP3.\lanes, Y_3.\lanes, X_3.\lanes
-	zip2		Y_3.\lanes,  Y_3.\lanes, X_3.\lanes
-
-	// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
-	eor		X_0.16b, TMP0.16b, TWEAKV0.16b
-	eor		Y_0.16b, Y_0.16b,  TWEAKV1.16b
-	eor		X_1.16b, TMP1.16b, TWEAKV2.16b
-	eor		Y_1.16b, Y_1.16b,  TWEAKV3.16b
-	eor		X_2.16b, TMP2.16b, TWEAKV4.16b
-	eor		Y_2.16b, Y_2.16b,  TWEAKV5.16b
-	eor		X_3.16b, TMP3.16b, TWEAKV6.16b
-	eor		Y_3.16b, Y_3.16b,  TWEAKV7.16b
-	mov		TWEAKV0.16b, TWEAKV_NEXT.16b
-
-	// Store the ciphertext in the destination buffer
-	st1		{X_0.16b-Y_1.16b}, [DST], #64
-	st1		{X_2.16b-Y_3.16b}, [DST], #64
-
-	// Continue if there are more 128-byte chunks remaining
-	subs		NBYTES, NBYTES, #128
-	bne		.Lnext_128bytes_\@
-
-	// Store the next tweak and return
-.if \n == 64
-	st1		{TWEAKV_NEXT.16b}, [TWEAK]
-.else
-	st1		{TWEAKV_NEXT.8b}, [TWEAK]
-.endif
-	ret
-.endm
-
-ENTRY(speck128_xts_encrypt_neon)
-	_speck_xts_crypt	n=64, lanes=2d, decrypting=0
-ENDPROC(speck128_xts_encrypt_neon)
-
-ENTRY(speck128_xts_decrypt_neon)
-	_speck_xts_crypt	n=64, lanes=2d, decrypting=1
-ENDPROC(speck128_xts_decrypt_neon)
-
-ENTRY(speck64_xts_encrypt_neon)
-	_speck_xts_crypt	n=32, lanes=4s, decrypting=0
-ENDPROC(speck64_xts_encrypt_neon)
-
-ENTRY(speck64_xts_decrypt_neon)
-	_speck_xts_crypt	n=32, lanes=4s, decrypting=1
-ENDPROC(speck64_xts_decrypt_neon)
diff --git a/arch/arm64/crypto/speck-neon-glue.c b/arch/arm64/crypto/speck-neon-glue.c
deleted file mode 100644
index 6e233aeb4ff4..000000000000
--- a/arch/arm64/crypto/speck-neon-glue.c
+++ /dev/null
@@ -1,282 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- * (64-bit version; based on the 32-bit version)
- *
- * Copyright (c) 2018 Google, Inc
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/algapi.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/speck.h>
-#include <crypto/xts.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-/* The assembly functions only handle multiples of 128 bytes */
-#define SPECK_NEON_CHUNK_SIZE	128
-
-/* Speck128 */
-
-struct speck128_xts_tfm_ctx {
-	struct speck128_tfm_ctx main_key;
-	struct speck128_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
-					  void *dst, const void *src,
-					  unsigned int nbytes, void *tweak);
-
-asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
-					  void *dst, const void *src,
-					  unsigned int nbytes, void *tweak);
-
-typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
-				     u8 *, const u8 *);
-typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
-					  const void *, unsigned int, void *);
-
-static __always_inline int
-__speck128_xts_crypt(struct skcipher_request *req,
-		     speck128_crypt_one_t crypt_one,
-		     speck128_xts_crypt_many_t crypt_many)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	le128 tweak;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-		u8 *dst = walk.dst.virt.addr;
-		const u8 *src = walk.src.virt.addr;
-
-		if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
-			unsigned int count;
-
-			count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
-			kernel_neon_begin();
-			(*crypt_many)(ctx->main_key.round_keys,
-				      ctx->main_key.nrounds,
-				      dst, src, count, &tweak);
-			kernel_neon_end();
-			dst += count;
-			src += count;
-			nbytes -= count;
-		}
-
-		/* Handle any remainder with generic code */
-		while (nbytes >= sizeof(tweak)) {
-			le128_xor((le128 *)dst, (const le128 *)src, &tweak);
-			(*crypt_one)(&ctx->main_key, dst, dst);
-			le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
-			gf128mul_x_ble(&tweak, &tweak);
-
-			dst += sizeof(tweak);
-			src += sizeof(tweak);
-			nbytes -= sizeof(tweak);
-		}
-		err = skcipher_walk_done(&walk, nbytes);
-	}
-
-	return err;
-}
-
-static int speck128_xts_encrypt(struct skcipher_request *req)
-{
-	return __speck128_xts_crypt(req, crypto_speck128_encrypt,
-				    speck128_xts_encrypt_neon);
-}
-
-static int speck128_xts_decrypt(struct skcipher_request *req)
-{
-	return __speck128_xts_crypt(req, crypto_speck128_decrypt,
-				    speck128_xts_decrypt_neon);
-}
-
-static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			       unsigned int keylen)
-{
-	struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int err;
-
-	err = xts_verify_key(tfm, key, keylen);
-	if (err)
-		return err;
-
-	keylen /= 2;
-
-	err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
-	if (err)
-		return err;
-
-	return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-/* Speck64 */
-
-struct speck64_xts_tfm_ctx {
-	struct speck64_tfm_ctx main_key;
-	struct speck64_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
-					 void *dst, const void *src,
-					 unsigned int nbytes, void *tweak);
-
-asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
-					 void *dst, const void *src,
-					 unsigned int nbytes, void *tweak);
-
-typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
-				    u8 *, const u8 *);
-typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
-					 const void *, unsigned int, void *);
-
-static __always_inline int
-__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
-		    speck64_xts_crypt_many_t crypt_many)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	__le64 tweak;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-		u8 *dst = walk.dst.virt.addr;
-		const u8 *src = walk.src.virt.addr;
-
-		if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
-			unsigned int count;
-
-			count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
-			kernel_neon_begin();
-			(*crypt_many)(ctx->main_key.round_keys,
-				      ctx->main_key.nrounds,
-				      dst, src, count, &tweak);
-			kernel_neon_end();
-			dst += count;
-			src += count;
-			nbytes -= count;
-		}
-
-		/* Handle any remainder with generic code */
-		while (nbytes >= sizeof(tweak)) {
-			*(__le64 *)dst = *(__le64 *)src ^ tweak;
-			(*crypt_one)(&ctx->main_key, dst, dst);
-			*(__le64 *)dst ^= tweak;
-			tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
-					    ((tweak & cpu_to_le64(1ULL << 63)) ?
-					     0x1B : 0));
-			dst += sizeof(tweak);
-			src += sizeof(tweak);
-			nbytes -= sizeof(tweak);
-		}
-		err = skcipher_walk_done(&walk, nbytes);
-	}
-
-	return err;
-}
-
-static int speck64_xts_encrypt(struct skcipher_request *req)
-{
-	return __speck64_xts_crypt(req, crypto_speck64_encrypt,
-				   speck64_xts_encrypt_neon);
-}
-
-static int speck64_xts_decrypt(struct skcipher_request *req)
-{
-	return __speck64_xts_crypt(req, crypto_speck64_decrypt,
-				   speck64_xts_decrypt_neon);
-}
-
-static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			      unsigned int keylen)
-{
-	struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int err;
-
-	err = xts_verify_key(tfm, key, keylen);
-	if (err)
-		return err;
-
-	keylen /= 2;
-
-	err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
-	if (err)
-		return err;
-
-	return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-static struct skcipher_alg speck_algs[] = {
-	{
-		.base.cra_name		= "xts(speck128)",
-		.base.cra_driver_name	= "xts-speck128-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= SPECK128_BLOCK_SIZE,
-		.base.cra_ctxsize	= sizeof(struct speck128_xts_tfm_ctx),
-		.base.cra_alignmask	= 7,
-		.base.cra_module	= THIS_MODULE,
-		.min_keysize		= 2 * SPECK128_128_KEY_SIZE,
-		.max_keysize		= 2 * SPECK128_256_KEY_SIZE,
-		.ivsize			= SPECK128_BLOCK_SIZE,
-		.walksize		= SPECK_NEON_CHUNK_SIZE,
-		.setkey			= speck128_xts_setkey,
-		.encrypt		= speck128_xts_encrypt,
-		.decrypt		= speck128_xts_decrypt,
-	}, {
-		.base.cra_name		= "xts(speck64)",
-		.base.cra_driver_name	= "xts-speck64-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= SPECK64_BLOCK_SIZE,
-		.base.cra_ctxsize	= sizeof(struct speck64_xts_tfm_ctx),
-		.base.cra_alignmask	= 7,
-		.base.cra_module	= THIS_MODULE,
-		.min_keysize		= 2 * SPECK64_96_KEY_SIZE,
-		.max_keysize		= 2 * SPECK64_128_KEY_SIZE,
-		.ivsize			= SPECK64_BLOCK_SIZE,
-		.walksize		= SPECK_NEON_CHUNK_SIZE,
-		.setkey			= speck64_xts_setkey,
-		.encrypt		= speck64_xts_encrypt,
-		.decrypt		= speck64_xts_decrypt,
-	}
-};
-
-static int __init speck_neon_module_init(void)
-{
-	if (!(elf_hwcap & HWCAP_ASIMD))
-		return -ENODEV;
-	return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-static void __exit speck_neon_module_exit(void)
-{
-	crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-module_init(speck_neon_module_init);
-module_exit(speck_neon_module_exit);
-
-MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("xts(speck128)");
-MODULE_ALIAS_CRYPTO("xts-speck128-neon");
-MODULE_ALIAS_CRYPTO("xts(speck64)");
-MODULE_ALIAS_CRYPTO("xts-speck64-neon");
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 0bcc98dbba56..6142402c2eb4 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -286,12 +286,11 @@ alternative_endif
 	ldr	\rd, [\rn, #MM_CONTEXT_ID]
 	.endm
 /*
- * read_ctr - read CTR_EL0. If the system has mismatched
- * cache line sizes, provide the system wide safe value
- * from arm64_ftr_reg_ctrel0.sys_val
+ * read_ctr - read CTR_EL0. If the system has mismatched register fields,
+ * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
  */
 	.macro	read_ctr, reg
-alternative_if_not ARM64_MISMATCHED_CACHE_LINE_SIZE
+alternative_if_not ARM64_MISMATCHED_CACHE_TYPE
 	mrs	\reg, ctr_el0			// read CTR
 	nop
 alternative_else
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 5ee5bca8c24b..13dd42c3ad4e 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -40,6 +40,15 @@
 #define L1_CACHE_SHIFT		(6)
 #define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 
+
+#define CLIDR_LOUU_SHIFT	27
+#define CLIDR_LOC_SHIFT		24
+#define CLIDR_LOUIS_SHIFT	21
+
+#define CLIDR_LOUU(clidr)	(((clidr) >> CLIDR_LOUU_SHIFT) & 0x7)
+#define CLIDR_LOC(clidr)	(((clidr) >> CLIDR_LOC_SHIFT) & 0x7)
+#define CLIDR_LOUIS(clidr)	(((clidr) >> CLIDR_LOUIS_SHIFT) & 0x7)
+
 /*
  * Memory returned by kmalloc() may be used for DMA, so we must make
  * sure that all such allocations are cache aligned. Otherwise,
@@ -84,6 +93,37 @@ static inline int cache_line_size(void)
 	return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
 }
 
+/*
+ * Read the effective value of CTR_EL0.
+ *
+ * According to ARM ARM for ARMv8-A (ARM DDI 0487C.a),
+ * section D10.2.33 "CTR_EL0, Cache Type Register" :
+ *
+ * CTR_EL0.IDC reports the data cache clean requirements for
+ * instruction to data coherence.
+ *
+ *  0 - dcache clean to PoU is required unless :
+ *     (CLIDR_EL1.LoC == 0) || (CLIDR_EL1.LoUIS == 0 && CLIDR_EL1.LoUU == 0)
+ *  1 - dcache clean to PoU is not required for i-to-d coherence.
+ *
+ * This routine provides the CTR_EL0 with the IDC field updated to the
+ * effective state.
+ */
+static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void)
+{
+	u32 ctr = read_cpuid_cachetype();
+
+	if (!(ctr & BIT(CTR_IDC_SHIFT))) {
+		u64 clidr = read_sysreg(clidr_el1);
+
+		if (CLIDR_LOC(clidr) == 0 ||
+		    (CLIDR_LOUIS(clidr) == 0 && CLIDR_LOUU(clidr) == 0))
+			ctr |= BIT(CTR_IDC_SHIFT);
+	}
+
+	return ctr;
+}
+
 #endif	/* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index 1a037b94eba1..93ce86d5dae1 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -25,6 +25,8 @@
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 
+#include <asm-generic/compat.h>
+
 #define COMPAT_USER_HZ		100
 #ifdef __AARCH64EB__
 #define COMPAT_UTS_MACHINE	"armv8b\0\0"
@@ -32,10 +34,6 @@
 #define COMPAT_UTS_MACHINE	"armv8l\0\0"
 #endif
 
-typedef u32		compat_size_t;
-typedef s32		compat_ssize_t;
-typedef s32		compat_clock_t;
-typedef s32		compat_pid_t;
 typedef u16		__compat_uid_t;
 typedef u16		__compat_gid_t;
 typedef u16		__compat_uid16_t;
@@ -43,27 +41,13 @@ typedef u16		__compat_gid16_t;
 typedef u32		__compat_uid32_t;
 typedef u32		__compat_gid32_t;
 typedef u16		compat_mode_t;
-typedef u32		compat_ino_t;
 typedef u32		compat_dev_t;
-typedef s32		compat_off_t;
-typedef s64		compat_loff_t;
 typedef s32		compat_nlink_t;
 typedef u16		compat_ipc_pid_t;
-typedef s32		compat_daddr_t;
 typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
-typedef s32		compat_key_t;
-typedef s32		compat_timer_t;
-
-typedef s16		compat_short_t;
-typedef s32		compat_int_t;
-typedef s32		compat_long_t;
 typedef s64		compat_s64;
-typedef u16		compat_ushort_t;
-typedef u32		compat_uint_t;
-typedef u32		compat_ulong_t;
 typedef u64		compat_u64;
-typedef u32		compat_uptr_t;
 
 struct compat_stat {
 #ifdef __AARCH64EB__
@@ -86,11 +70,11 @@ struct compat_stat {
 	compat_off_t	st_size;
 	compat_off_t	st_blksize;
 	compat_off_t	st_blocks;
-	compat_time_t	st_atime;
+	old_time32_t	st_atime;
 	compat_ulong_t	st_atime_nsec;
-	compat_time_t	st_mtime;
+	old_time32_t	st_mtime;
 	compat_ulong_t	st_mtime_nsec;
-	compat_time_t	st_ctime;
+	old_time32_t	st_ctime;
 	compat_ulong_t	st_ctime_nsec;
 	compat_ulong_t	__unused4[2];
 };
@@ -159,6 +143,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
 }
 
 #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current)))
+#define COMPAT_MINSIGSTKSZ	2048
 
 static inline void __user *arch_compat_alloc_user_space(long len)
 {
diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h
deleted file mode 100644
index ee35fd0f2236..000000000000
--- a/arch/arm64/include/asm/compiler.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Based on arch/arm/include/asm/compiler.h
- *
- * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_COMPILER_H
-#define __ASM_COMPILER_H
-
-/*
- * This is used to ensure the compiler did actually allocate the register we
- * asked it for some inline assembly sequences.  Apparently we can't trust the
- * compiler from one version to another so a bit of paranoia won't hurt.  This
- * string is meant to be concatenated with the inline asm string and will
- * cause compilation to stop on mismatch.  (for details, see gcc PR 15089)
- */
-#define __asmeq(x, y)  ".ifnc " x "," y " ; .err ; .endif\n\t"
-
-#endif	/* __ASM_COMPILER_H */
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index ae1f70450fb2..6e2d254c09eb 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -33,7 +33,7 @@
 #define ARM64_WORKAROUND_CAVIUM_27456		12
 #define ARM64_HAS_32BIT_EL0			13
 #define ARM64_HARDEN_EL2_VECTORS		14
-#define ARM64_MISMATCHED_CACHE_LINE_SIZE	15
+#define ARM64_HAS_CNP				15
 #define ARM64_HAS_NO_FPSIMD			16
 #define ARM64_WORKAROUND_REPEAT_TLBI		17
 #define ARM64_WORKAROUND_QCOM_FALKOR_E1003	18
@@ -51,7 +51,10 @@
 #define ARM64_SSBD				30
 #define ARM64_MISMATCHED_CACHE_TYPE		31
 #define ARM64_HAS_STAGE2_FWB			32
+#define ARM64_HAS_CRC32				33
+#define ARM64_SSBS				34
+#define ARM64_WORKAROUND_1188873		35
 
-#define ARM64_NCAPS				33
+#define ARM64_NCAPS				36
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 1717ba1db35d..7e2ec64aa414 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -262,7 +262,7 @@ extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
 /*
  * CPU feature detected at boot time based on system-wide value of a
  * feature. It is safe for a late CPU to have this feature even though
- * the system hasn't enabled it, although the featuer will not be used
+ * the system hasn't enabled it, although the feature will not be used
  * by Linux in this case. If the system has enabled this feature already,
  * then every late CPU must have it.
  */
@@ -508,6 +508,12 @@ static inline bool system_supports_sve(void)
 		cpus_have_const_cap(ARM64_SVE);
 }
 
+static inline bool system_supports_cnp(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_CNP) &&
+		cpus_have_const_cap(ARM64_HAS_CNP);
+}
+
 #define ARM64_SSBD_UNKNOWN		-1
 #define ARM64_SSBD_FORCE_DISABLE	0
 #define ARM64_SSBD_KERNEL		1
@@ -530,6 +536,28 @@ void arm64_set_ssbd_mitigation(bool state);
 static inline void arm64_set_ssbd_mitigation(bool state) {}
 #endif
 
+extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
+
+static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
+{
+	switch (parange) {
+	case 0: return 32;
+	case 1: return 36;
+	case 2: return 40;
+	case 3: return 42;
+	case 4: return 44;
+	case 5: return 48;
+	case 6: return 52;
+	/*
+	 * A future PE could use a value unknown to the kernel.
+	 * However, by the "D10.1.4 Principles of the ID scheme
+	 * for fields in ID registers", ARM DDI 0487C.a, any new
+	 * value is guaranteed to be higher than what we know already.
+	 * As a safe limit, we return the limit supported by the kernel.
+	 */
+	default: return CONFIG_ARM64_PA_BITS;
+	}
+}
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index ea690b3562af..12f93e4d2452 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -86,6 +86,7 @@
 #define ARM_CPU_PART_CORTEX_A75		0xD0A
 #define ARM_CPU_PART_CORTEX_A35		0xD04
 #define ARM_CPU_PART_CORTEX_A55		0xD05
+#define ARM_CPU_PART_CORTEX_A76		0xD0B
 
 #define APM_CPU_PART_POTENZA		0x000
 
@@ -110,6 +111,7 @@
 #define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
 #define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35)
 #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55)
+#define MIDR_CORTEX_A76	MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
 #define MIDR_THUNDERX	MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
 #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h
index 22e4c83de5a5..8d91f2233135 100644
--- a/arch/arm64/include/asm/daifflags.h
+++ b/arch/arm64/include/asm/daifflags.h
@@ -36,11 +36,8 @@ static inline unsigned long local_daif_save(void)
 {
 	unsigned long flags;
 
-	asm volatile(
-		"mrs	%0, daif		// local_daif_save\n"
-		: "=r" (flags)
-		:
-		: "memory");
+	flags = arch_local_save_flags();
+
 	local_daif_mask();
 
 	return flags;
@@ -60,11 +57,9 @@ static inline void local_daif_restore(unsigned long flags)
 {
 	if (!arch_irqs_disabled_flags(flags))
 		trace_hardirqs_on();
-	asm volatile(
-		"msr	daif, %0		// local_daif_restore"
-		:
-		: "r" (flags)
-		: "memory");
+
+	arch_local_irq_restore(flags);
+
 	if (arch_irqs_disabled_flags(flags))
 		trace_hardirqs_off();
 }
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index ce70c3ffb993..676de2ec1762 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -137,6 +137,8 @@
 #define ESR_ELx_CV		(UL(1) << 24)
 #define ESR_ELx_COND_SHIFT	(20)
 #define ESR_ELx_COND_MASK	(UL(0xF) << ESR_ELx_COND_SHIFT)
+#define ESR_ELx_WFx_ISS_TI	(UL(1) << 0)
+#define ESR_ELx_WFx_ISS_WFI	(UL(0) << 0)
 #define ESR_ELx_WFx_ISS_WFE	(UL(1) << 0)
 #define ESR_ELx_xVC_IMM_MASK	((1UL << 16) - 1)
 
@@ -148,6 +150,9 @@
 #define DISR_EL1_ESR_MASK	(ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC)
 
 /* ESR value templates for specific events */
+#define ESR_ELx_WFx_MASK	(ESR_ELx_EC_MASK | ESR_ELx_WFx_ISS_TI)
+#define ESR_ELx_WFx_WFI_VAL	((ESR_ELx_EC_WFx << ESR_ELx_EC_SHIFT) |	\
+				 ESR_ELx_WFx_ISS_WFI)
 
 /* BRK instruction trap from AArch64 state */
 #define ESR_ELx_VAL_BRK64(imm)					\
@@ -187,6 +192,8 @@
 
 #define ESR_ELx_SYS64_ISS_SYS_OP_MASK	(ESR_ELx_SYS64_ISS_SYS_MASK | \
 					 ESR_ELx_SYS64_ISS_DIR_MASK)
+#define ESR_ELx_SYS64_ISS_RT(esr) \
+	(((esr) & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT)
 /*
  * User space cache operations have the following sysreg encoding
  * in System instructions.
@@ -206,6 +213,18 @@
 #define ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL \
 				(ESR_ELx_SYS64_ISS_SYS_VAL(1, 3, 1, 7, 0) | \
 				 ESR_ELx_SYS64_ISS_DIR_WRITE)
+/*
+ * User space MRS operations which are supported for emulation
+ * have the following sysreg encoding in System instructions.
+ * op0 = 3, op1= 0, crn = 0, {crm = 0, 4-7}, READ (L = 1)
+ */
+#define ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK	(ESR_ELx_SYS64_ISS_OP0_MASK | \
+						 ESR_ELx_SYS64_ISS_OP1_MASK | \
+						 ESR_ELx_SYS64_ISS_CRN_MASK | \
+						 ESR_ELx_SYS64_ISS_DIR_MASK)
+#define ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL \
+				(ESR_ELx_SYS64_ISS_SYS_VAL(3, 0, 0, 0, 0) | \
+				 ESR_ELx_SYS64_ISS_DIR_READ)
 
 #define ESR_ELx_SYS64_ISS_SYS_CTR	ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 1, 0, 0)
 #define ESR_ELx_SYS64_ISS_SYS_CTR_READ	(ESR_ELx_SYS64_ISS_SYS_CTR | \
@@ -249,6 +268,64 @@
 
 #define ESR_ELx_FP_EXC_TFV	(UL(1) << 23)
 
+/*
+ * ISS field definitions for CP15 accesses
+ */
+#define ESR_ELx_CP15_32_ISS_DIR_MASK	0x1
+#define ESR_ELx_CP15_32_ISS_DIR_READ	0x1
+#define ESR_ELx_CP15_32_ISS_DIR_WRITE	0x0
+
+#define ESR_ELx_CP15_32_ISS_RT_SHIFT	5
+#define ESR_ELx_CP15_32_ISS_RT_MASK	(UL(0x1f) << ESR_ELx_CP15_32_ISS_RT_SHIFT)
+#define ESR_ELx_CP15_32_ISS_CRM_SHIFT	1
+#define ESR_ELx_CP15_32_ISS_CRM_MASK	(UL(0xf) << ESR_ELx_CP15_32_ISS_CRM_SHIFT)
+#define ESR_ELx_CP15_32_ISS_CRN_SHIFT	10
+#define ESR_ELx_CP15_32_ISS_CRN_MASK	(UL(0xf) << ESR_ELx_CP15_32_ISS_CRN_SHIFT)
+#define ESR_ELx_CP15_32_ISS_OP1_SHIFT	14
+#define ESR_ELx_CP15_32_ISS_OP1_MASK	(UL(0x7) << ESR_ELx_CP15_32_ISS_OP1_SHIFT)
+#define ESR_ELx_CP15_32_ISS_OP2_SHIFT	17
+#define ESR_ELx_CP15_32_ISS_OP2_MASK	(UL(0x7) << ESR_ELx_CP15_32_ISS_OP2_SHIFT)
+
+#define ESR_ELx_CP15_32_ISS_SYS_MASK	(ESR_ELx_CP15_32_ISS_OP1_MASK | \
+					 ESR_ELx_CP15_32_ISS_OP2_MASK | \
+					 ESR_ELx_CP15_32_ISS_CRN_MASK | \
+					 ESR_ELx_CP15_32_ISS_CRM_MASK | \
+					 ESR_ELx_CP15_32_ISS_DIR_MASK)
+#define ESR_ELx_CP15_32_ISS_SYS_VAL(op1, op2, crn, crm) \
+					(((op1) << ESR_ELx_CP15_32_ISS_OP1_SHIFT) | \
+					 ((op2) << ESR_ELx_CP15_32_ISS_OP2_SHIFT) | \
+					 ((crn) << ESR_ELx_CP15_32_ISS_CRN_SHIFT) | \
+					 ((crm) << ESR_ELx_CP15_32_ISS_CRM_SHIFT))
+
+#define ESR_ELx_CP15_64_ISS_DIR_MASK	0x1
+#define ESR_ELx_CP15_64_ISS_DIR_READ	0x1
+#define ESR_ELx_CP15_64_ISS_DIR_WRITE	0x0
+
+#define ESR_ELx_CP15_64_ISS_RT_SHIFT	5
+#define ESR_ELx_CP15_64_ISS_RT_MASK	(UL(0x1f) << ESR_ELx_CP15_64_ISS_RT_SHIFT)
+
+#define ESR_ELx_CP15_64_ISS_RT2_SHIFT	10
+#define ESR_ELx_CP15_64_ISS_RT2_MASK	(UL(0x1f) << ESR_ELx_CP15_64_ISS_RT2_SHIFT)
+
+#define ESR_ELx_CP15_64_ISS_OP1_SHIFT	16
+#define ESR_ELx_CP15_64_ISS_OP1_MASK	(UL(0xf) << ESR_ELx_CP15_64_ISS_OP1_SHIFT)
+#define ESR_ELx_CP15_64_ISS_CRM_SHIFT	1
+#define ESR_ELx_CP15_64_ISS_CRM_MASK	(UL(0xf) << ESR_ELx_CP15_64_ISS_CRM_SHIFT)
+
+#define ESR_ELx_CP15_64_ISS_SYS_VAL(op1, crm) \
+					(((op1) << ESR_ELx_CP15_64_ISS_OP1_SHIFT) | \
+					 ((crm) << ESR_ELx_CP15_64_ISS_CRM_SHIFT))
+
+#define ESR_ELx_CP15_64_ISS_SYS_MASK	(ESR_ELx_CP15_64_ISS_OP1_MASK |	\
+					 ESR_ELx_CP15_64_ISS_CRM_MASK | \
+					 ESR_ELx_CP15_64_ISS_DIR_MASK)
+
+#define ESR_ELx_CP15_64_ISS_SYS_CNTVCT	(ESR_ELx_CP15_64_ISS_SYS_VAL(1, 14) | \
+					 ESR_ELx_CP15_64_ISS_DIR_READ)
+
+#define ESR_ELx_CP15_32_ISS_SYS_CNTFRQ	(ESR_ELx_CP15_32_ISS_SYS_VAL(0, 0, 14, 0) |\
+					 ESR_ELx_CP15_32_ISS_DIR_READ)
+
 #ifndef __ASSEMBLY__
 #include <asm/types.h>
 
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 35b2e50f17fb..9f8b915af3a7 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -31,8 +31,6 @@
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
 
-#include <xen/xen.h>
-
 /*
  * Generic IO read/write.  These perform native-endian accesses.
  */
@@ -205,12 +203,5 @@ extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
 
 extern int devmem_is_allowed(unsigned long pfn);
 
-struct bio_vec;
-extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
-				      const struct bio_vec *vec2);
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)				\
-	(__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&				\
-	 (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-
 #endif	/* __KERNEL__ */
 #endif	/* __ASM_IO_H */
diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h
index 7e2b3e360086..472023498d71 100644
--- a/arch/arm64/include/asm/jump_label.h
+++ b/arch/arm64/include/asm/jump_label.h
@@ -26,13 +26,16 @@
 
 #define JUMP_LABEL_NOP_SIZE		AARCH64_INSN_SIZE
 
-static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
+static __always_inline bool arch_static_branch(struct static_key *key,
+					       bool branch)
 {
-	asm_volatile_goto("1: nop\n\t"
-		 ".pushsection __jump_table,  \"aw\"\n\t"
-		 ".align 3\n\t"
-		 ".quad 1b, %l[l_yes], %c0\n\t"
-		 ".popsection\n\t"
+	asm_volatile_goto(
+		"1:	nop					\n\t"
+		 "	.pushsection	__jump_table, \"aw\"	\n\t"
+		 "	.align		3			\n\t"
+		 "	.long		1b - ., %l[l_yes] - .	\n\t"
+		 "	.quad		%c0 - .			\n\t"
+		 "	.popsection				\n\t"
 		 :  :  "i"(&((char *)key)[branch]) :  : l_yes);
 
 	return false;
@@ -40,13 +43,16 @@ l_yes:
 	return true;
 }
 
-static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+static __always_inline bool arch_static_branch_jump(struct static_key *key,
+						    bool branch)
 {
-	asm_volatile_goto("1: b %l[l_yes]\n\t"
-		 ".pushsection __jump_table,  \"aw\"\n\t"
-		 ".align 3\n\t"
-		 ".quad 1b, %l[l_yes], %c0\n\t"
-		 ".popsection\n\t"
+	asm_volatile_goto(
+		"1:	b		%l[l_yes]		\n\t"
+		 "	.pushsection	__jump_table, \"aw\"	\n\t"
+		 "	.align		3			\n\t"
+		 "	.long		1b - ., %l[l_yes] - .	\n\t"
+		 "	.quad		%c0 - .			\n\t"
+		 "	.popsection				\n\t"
 		 :  :  "i"(&((char *)key)[branch]) :  : l_yes);
 
 	return false;
@@ -54,13 +60,5 @@ l_yes:
 	return true;
 }
 
-typedef u64 jump_label_t;
-
-struct jump_entry {
-	jump_label_t code;
-	jump_label_t target;
-	jump_label_t key;
-};
-
 #endif  /* __ASSEMBLY__ */
 #endif	/* __ASM_JUMP_LABEL_H */
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index a780f6714b44..850e2122d53f 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -97,7 +97,7 @@
 			+ EARLY_PGDS((vstart), (vend)) 	/* each PGDIR needs a next level page table */	\
 			+ EARLY_PUDS((vstart), (vend))	/* each PUD needs a next level page table */	\
 			+ EARLY_PMDS((vstart), (vend)))	/* each PMD needs a next level page table */
-#define SWAPPER_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR + TEXT_OFFSET, _end))
+#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR + TEXT_OFFSET, _end))
 #define IDMAP_DIR_SIZE		(IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index aa45df752a16..6f602af5263c 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -107,6 +107,7 @@
 #define VTCR_EL2_RES1		(1 << 31)
 #define VTCR_EL2_HD		(1 << 22)
 #define VTCR_EL2_HA		(1 << 21)
+#define VTCR_EL2_PS_SHIFT	TCR_EL2_PS_SHIFT
 #define VTCR_EL2_PS_MASK	TCR_EL2_PS_MASK
 #define VTCR_EL2_TG0_MASK	TCR_TG0_MASK
 #define VTCR_EL2_TG0_4K		TCR_TG0_4K
@@ -120,62 +121,150 @@
 #define VTCR_EL2_IRGN0_WBWA	TCR_IRGN0_WBWA
 #define VTCR_EL2_SL0_SHIFT	6
 #define VTCR_EL2_SL0_MASK	(3 << VTCR_EL2_SL0_SHIFT)
-#define VTCR_EL2_SL0_LVL1	(1 << VTCR_EL2_SL0_SHIFT)
 #define VTCR_EL2_T0SZ_MASK	0x3f
-#define VTCR_EL2_T0SZ_40B	24
 #define VTCR_EL2_VS_SHIFT	19
 #define VTCR_EL2_VS_8BIT	(0 << VTCR_EL2_VS_SHIFT)
 #define VTCR_EL2_VS_16BIT	(1 << VTCR_EL2_VS_SHIFT)
 
+#define VTCR_EL2_T0SZ(x)	TCR_T0SZ(x)
+
 /*
  * We configure the Stage-2 page tables to always restrict the IPA space to be
  * 40 bits wide (T0SZ = 24).  Systems with a PARange smaller than 40 bits are
  * not known to exist and will break with this configuration.
  *
- * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time
- * (see hyp-init.S).
+ * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
  *
  * Note that when using 4K pages, we concatenate two first level page tables
  * together. With 16K pages, we concatenate 16 first level page tables.
  *
- * The magic numbers used for VTTBR_X in this patch can be found in Tables
- * D4-23 and D4-25 in ARM DDI 0487A.b.
  */
 
-#define VTCR_EL2_T0SZ_IPA	VTCR_EL2_T0SZ_40B
 #define VTCR_EL2_COMMON_BITS	(VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
 				 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
 
-#ifdef CONFIG_ARM64_64K_PAGES
 /*
- * Stage2 translation configuration:
- * 64kB pages (TG0 = 1)
- * 2 level page tables (SL = 1)
+ * VTCR_EL2:SL0 indicates the entry level for Stage2 translation.
+ * Interestingly, it depends on the page size.
+ * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a
+ *
+ *	-----------------------------------------
+ *	| Entry level		|  4K  | 16K/64K |
+ *	------------------------------------------
+ *	| Level: 0		|  2   |   -     |
+ *	------------------------------------------
+ *	| Level: 1		|  1   |   2     |
+ *	------------------------------------------
+ *	| Level: 2		|  0   |   1     |
+ *	------------------------------------------
+ *	| Level: 3		|  -   |   0     |
+ *	------------------------------------------
+ *
+ * The table roughly translates to :
+ *
+ *	SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level
+ *
+ * Where TGRAN_SL0_BASE is a magic number depending on the page size:
+ * 	TGRAN_SL0_BASE(4K) = 2
+ *	TGRAN_SL0_BASE(16K) = 3
+ *	TGRAN_SL0_BASE(64K) = 3
+ * provided we take care of ruling out the unsupported cases and
+ * Entry_Level = 4 - Number_of_levels.
+ *
  */
-#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC		38
+#ifdef CONFIG_ARM64_64K_PAGES
+
+#define VTCR_EL2_TGRAN			VTCR_EL2_TG0_64K
+#define VTCR_EL2_TGRAN_SL0_BASE		3UL
+
 #elif defined(CONFIG_ARM64_16K_PAGES)
-/*
- * Stage2 translation configuration:
- * 16kB pages (TG0 = 2)
- * 2 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC		42
+
+#define VTCR_EL2_TGRAN			VTCR_EL2_TG0_16K
+#define VTCR_EL2_TGRAN_SL0_BASE		3UL
+
 #else	/* 4K */
-/*
- * Stage2 translation configuration:
- * 4kB pages (TG0 = 0)
- * 3 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC		37
+
+#define VTCR_EL2_TGRAN			VTCR_EL2_TG0_4K
+#define VTCR_EL2_TGRAN_SL0_BASE		2UL
+
 #endif
 
-#define VTCR_EL2_FLAGS			(VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
-#define VTTBR_X				(VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
+#define VTCR_EL2_LVLS_TO_SL0(levels)	\
+	((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT)
+#define VTCR_EL2_SL0_TO_LVLS(sl0)	\
+	((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE)
+#define VTCR_EL2_LVLS(vtcr)		\
+	VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT)
 
-#define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)
+#define VTCR_EL2_FLAGS			(VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN)
+#define VTCR_EL2_IPA(vtcr)		(64 - ((vtcr) & VTCR_EL2_T0SZ_MASK))
+
+/*
+ * ARM VMSAv8-64 defines an algorithm for finding the translation table
+ * descriptors in section D4.2.8 in ARM DDI 0487C.a.
+ *
+ * The algorithm defines the expectations on the translation table
+ * addresses for each level, based on PAGE_SIZE, entry level
+ * and the translation table size (T0SZ). The variable "x" in the
+ * algorithm determines the alignment of a table base address at a given
+ * level and thus determines the alignment of VTTBR:BADDR for stage2
+ * page table entry level.
+ * Since the number of bits resolved at the entry level could vary
+ * depending on the T0SZ, the value of "x" is defined based on a
+ * Magic constant for a given PAGE_SIZE and Entry Level. The
+ * intermediate levels must be always aligned to the PAGE_SIZE (i.e,
+ * x = PAGE_SHIFT).
+ *
+ * The value of "x" for entry level is calculated as :
+ *    x = Magic_N - T0SZ
+ *
+ * where Magic_N is an integer depending on the page size and the entry
+ * level of the page table as below:
+ *
+ *	--------------------------------------------
+ *	| Entry level		|  4K    16K   64K |
+ *	--------------------------------------------
+ *	| Level: 0 (4 levels)	| 28   |  -  |  -  |
+ *	--------------------------------------------
+ *	| Level: 1 (3 levels)	| 37   | 31  | 25  |
+ *	--------------------------------------------
+ *	| Level: 2 (2 levels)	| 46   | 42  | 38  |
+ *	--------------------------------------------
+ *	| Level: 3 (1 level)	| -    | 53  | 51  |
+ *	--------------------------------------------
+ *
+ * We have a magic formula for the Magic_N below:
+ *
+ *  Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels)
+ *
+ * where Number_of_levels = (4 - Level). We are only interested in the
+ * value for Entry_Level for the stage2 page table.
+ *
+ * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows:
+ *
+ *	x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT)
+ *	  = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels)
+ *
+ * Here is one way to explain the Magic Formula:
+ *
+ *  x = log2(Size_of_Entry_Level_Table)
+ *
+ * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another
+ * PAGE_SHIFT bits in the PTE, we have :
+ *
+ *  Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT)
+ *		     = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3
+ *  where n = number of levels, and since each pointer is 8bytes, we have:
+ *
+ *  x = Bits_Entry_Level + 3
+ *    = IPA_SHIFT - (PAGE_SHIFT - 3) * n
+ *
+ * The only constraint here is that, we have to find the number of page table
+ * levels for a given IPA size (which we do, see stage2_pt_levels())
+ */
+#define ARM64_VTTBR_X(ipa, levels)	((ipa) - ((levels) * (PAGE_SHIFT - 3)))
+
+#define VTTBR_CNP_BIT     (UL(1))
 #define VTTBR_VMID_SHIFT  (UL(48))
 #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
 
@@ -223,6 +312,13 @@
 
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK	(~UL(0xf))
+/*
+ * We have
+ *	PAR	[PA_Shift - 1	: 12] = PA	[PA_Shift - 1 : 12]
+ *	HPFAR	[PA_Shift - 9	: 4]  = FIPA	[PA_Shift - 1 : 12]
+ */
+#define PAR_TO_HPFAR(par)		\
+	(((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
 
 #define kvm_arm_exception_type	\
 	{0, "IRQ" }, 		\
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 102b5a5c47b6..aea01a09eb94 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -30,6 +30,7 @@
 #define ARM_EXCEPTION_IRQ	  0
 #define ARM_EXCEPTION_EL1_SERROR  1
 #define ARM_EXCEPTION_TRAP	  2
+#define ARM_EXCEPTION_IL	  3
 /* The hyp-stub will return this for any kvm_call_hyp() call */
 #define ARM_EXCEPTION_HYP_GONE	  HVC_STUB_ERR
 
@@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
-extern u32 __init_stage2_translation(void);
-
 /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
 #define __hyp_this_cpu_ptr(sym)						\
 	({								\
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 6106a85ae0be..21247870def7 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -335,7 +335,7 @@ static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
 static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
 {
 	u32 esr = kvm_vcpu_get_hsr(vcpu);
-	return (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+	return ESR_ELx_SYS64_ISS_RT(esr);
 }
 
 static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3d6d7336f871..52fbc823ff8c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 
 struct kvm_arch {
@@ -61,11 +61,13 @@ struct kvm_arch {
 	u64    vmid_gen;
 	u32    vmid;
 
-	/* 1-level 2nd stage table, protected by kvm->mmu_lock */
+	/* stage2 entry level table */
 	pgd_t *pgd;
 
 	/* VTTBR value associated with above pgd and vmid */
 	u64    vttbr;
+	/* VTCR_EL2 value for this VM */
+	u64    vtcr;
 
 	/* The last vcpu id that ran on each physical CPU */
 	int __percpu *last_vcpu_ran;
@@ -387,6 +389,8 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
 DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
 
+void __kvm_enable_ssbs(void);
+
 static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
 				       unsigned long vector_ptr)
@@ -407,6 +411,15 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 	 */
 	BUG_ON(!static_branch_likely(&arm64_const_caps_ready));
 	__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+
+	/*
+	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
+	 * at EL2.
+	 */
+	if (!has_vhe() && this_cpu_has_cap(ARM64_SSBS) &&
+	    arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
+		kvm_call_hyp(__kvm_enable_ssbs);
+	}
 }
 
 static inline bool kvm_arch_check_sve_has_vhe(void)
@@ -440,13 +453,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 			       struct kvm_device_attr *attr);
 
-static inline void __cpu_init_stage2(void)
-{
-	u32 parange = kvm_call_hyp(__init_stage2_translation);
-
-	WARN_ONCE(parange < 40,
-		  "PARange is %d bits, unsupported configuration!", parange);
-}
+static inline void __cpu_init_stage2(void) {}
 
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@ -509,8 +516,12 @@ static inline int kvm_arm_have_ssbd(void)
 void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
 
+void kvm_set_ipa_limit(void);
+
 #define __KVM_HAVE_ARCH_VM_ALLOC
 struct kvm *kvm_arch_alloc_vm(void);
 void kvm_arch_free_vm(struct kvm *kvm);
 
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 384c34397619..23aca66767f9 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void);
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
 void __noreturn __hyp_do_panic(unsigned long, ...);
 
+/*
+ * Must be called from hyp code running at EL2 with an updated VTTBR
+ * and interrupts disabled.
+ */
+static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm)
+{
+	write_sysreg(kvm->arch.vtcr, vtcr_el2);
+	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+}
+
 #endif /* __ARM64_KVM_HYP_H__ */
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index d6fff7de5539..658657367f2f 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
  * We currently only support a 40bit IPA.
  */
 #define KVM_PHYS_SHIFT	(40)
-#define KVM_PHYS_SIZE	(1UL << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK	(KVM_PHYS_SIZE - 1UL)
+
+#define kvm_phys_shift(kvm)		VTCR_EL2_IPA(kvm->arch.vtcr)
+#define kvm_phys_size(kvm)		(_AC(1, ULL) << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm)		(kvm_phys_size(kvm) - _AC(1, ULL))
+
+static inline bool kvm_page_empty(void *ptr)
+{
+	struct page *ptr_page = virt_to_page(ptr);
+	return page_count(ptr_page) == 1;
+}
 
 #include <asm/stage2_pgtable.h>
 
@@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
 	return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
 }
 
-static inline bool kvm_page_empty(void *ptr)
-{
-	struct page *ptr_page = virt_to_page(ptr);
-	return page_count(ptr_page) == 1;
-}
-
 #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
 
 #ifdef __PAGETABLE_PMD_FOLDED
@@ -517,5 +519,34 @@ static inline int hyp_map_aux_data(void)
 
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
+/*
+ * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
+ * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
+ * 52bit IPS.
+ */
+static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
+{
+	int x = ARM64_VTTBR_X(ipa_shift, levels);
+
+	return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
+}
+
+static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
+{
+	unsigned int x = arm64_vttbr_x(ipa_shift, levels);
+
+	return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
+}
+
+static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
+{
+	return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
+}
+
+static inline bool kvm_cpu_has_cnp(void)
+{
+	return system_supports_cnp();
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index dd320df0d026..7689c7aa1d77 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -95,5 +95,8 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
 extern void mark_linear_text_alias_ro(void);
 
+#define INIT_MM_CONTEXT(name)	\
+	.pgd = init_pg_dir,
+
 #endif	/* !__ASSEMBLY__ */
 #endif
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 39ec0b8a689e..1e58bf58c22b 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -147,12 +147,25 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
 	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
 	ttbr_replace_func *replace_phys;
 
-	phys_addr_t pgd_phys = virt_to_phys(pgdp);
+	/* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
+	phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
+
+	if (system_supports_cnp() && !WARN_ON(pgdp != lm_alias(swapper_pg_dir))) {
+		/*
+		 * cpu_replace_ttbr1() is used when there's a boot CPU
+		 * up (i.e. cpufeature framework is not up yet) and
+		 * latter only when we enable CNP via cpufeature's
+		 * enable() callback.
+		 * Also we rely on the cpu_hwcap bit being set before
+		 * calling the enable() function.
+		 */
+		ttbr1 |= TTBR_CNP_BIT;
+	}
 
 	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
 
 	cpu_install_idmap();
-	replace_phys(pgd_phys);
+	replace_phys(ttbr1);
 	cpu_uninstall_idmap();
 }
 
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 60d02c81a3a2..c88a3cb117a1 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -37,9 +37,7 @@ extern void clear_page(void *to);
 
 typedef struct page *pgtable_t;
 
-#ifdef CONFIG_HAVE_ARCH_PFN_VALID
 extern int pfn_valid(unsigned long);
-#endif
 
 #include <asm/memory.h>
 
diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
index bb5dcea42003..799d9dd6f7cc 100644
--- a/arch/arm64/include/asm/paravirt.h
+++ b/arch/arm64/include/asm/paravirt.h
@@ -10,11 +10,16 @@ extern struct static_key paravirt_steal_rq_enabled;
 struct pv_time_ops {
 	unsigned long long (*steal_clock)(int cpu);
 };
-extern struct pv_time_ops pv_time_ops;
+
+struct paravirt_patch_template {
+	struct pv_time_ops time;
+};
+
+extern struct paravirt_patch_template pv_ops;
 
 static inline u64 paravirt_steal_clock(int cpu)
 {
-	return pv_time_ops.steal_clock(cpu);
+	return pv_ops.time.steal_clock(cpu);
 }
 #endif
 
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index fd208eac9f2a..1d7d8da2ef9b 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -211,6 +211,8 @@
 #define PHYS_MASK_SHIFT		(CONFIG_ARM64_PA_BITS)
 #define PHYS_MASK		((UL(1) << PHYS_MASK_SHIFT) - 1)
 
+#define TTBR_CNP_BIT		(UL(1) << 0)
+
 /*
  * TCR flags.
  */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 1bdeca8918a6..50b1ef8584c0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -360,6 +360,7 @@ static inline int pmd_protnone(pmd_t pmd)
 #define pmd_present(pmd)	pte_present(pmd_pte(pmd))
 #define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
+#define pmd_valid(pmd)		pte_valid(pmd_pte(pmd))
 #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
@@ -428,10 +429,33 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				 PUD_TYPE_TABLE)
 #endif
 
+extern pgd_t init_pg_dir[PTRS_PER_PGD];
+extern pgd_t init_pg_end[];
+extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
+extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
+extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
+
+extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline bool in_swapper_pgdir(void *addr)
+{
+	return ((unsigned long)addr & PAGE_MASK) ==
+	        ((unsigned long)swapper_pg_dir & PAGE_MASK);
+}
+
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
+#ifdef __PAGETABLE_PMD_FOLDED
+	if (in_swapper_pgdir(pmdp)) {
+		set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd)));
+		return;
+	}
+#endif /* __PAGETABLE_PMD_FOLDED */
+
 	WRITE_ONCE(*pmdp, pmd);
-	dsb(ishst);
+
+	if (pmd_valid(pmd))
+		dsb(ishst);
 }
 
 static inline void pmd_clear(pmd_t *pmdp)
@@ -477,11 +501,21 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 #define pud_none(pud)		(!pud_val(pud))
 #define pud_bad(pud)		(!(pud_val(pud) & PUD_TABLE_BIT))
 #define pud_present(pud)	pte_present(pud_pte(pud))
+#define pud_valid(pud)		pte_valid(pud_pte(pud))
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
 {
+#ifdef __PAGETABLE_PUD_FOLDED
+	if (in_swapper_pgdir(pudp)) {
+		set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud)));
+		return;
+	}
+#endif /* __PAGETABLE_PUD_FOLDED */
+
 	WRITE_ONCE(*pudp, pud);
-	dsb(ishst);
+
+	if (pud_valid(pud))
+		dsb(ishst);
 }
 
 static inline void pud_clear(pud_t *pudp)
@@ -532,6 +566,11 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 
 static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+	if (in_swapper_pgdir(pgdp)) {
+		set_swapper_pgd(pgdp, pgd);
+		return;
+	}
+
 	WRITE_ONCE(*pgdp, pgd);
 	dsb(ishst);
 }
@@ -712,11 +751,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 }
 #endif
 
-extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-extern pgd_t swapper_pg_end[];
-extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
-extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
-
 /*
  * Encode and decode a swap entry:
  *	bits 0-1:	present (must be zero)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 79657ad91397..2bf6691371c2 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -174,6 +174,10 @@ static inline void start_thread(struct pt_regs *regs, unsigned long pc,
 {
 	start_thread_common(regs, pc);
 	regs->pstate = PSR_MODE_EL0t;
+
+	if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
+		regs->pstate |= PSR_SSBS_BIT;
+
 	regs->sp = sp;
 }
 
@@ -190,6 +194,9 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
 	regs->pstate |= PSR_AA32_E_BIT;
 #endif
 
+	if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
+		regs->pstate |= PSR_AA32_SSBS_BIT;
+
 	regs->compat_sp = sp;
 }
 #endif
@@ -244,10 +251,6 @@ static inline void spin_lock_prefetch(const void *ptr)
 
 #endif
 
-void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused);
-void cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused);
-void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused);
-
 extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
 extern void __init minsigstksz_setup(void);
 
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 177b851ca6d9..fce22c4b2f73 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -25,6 +25,9 @@
 #define CurrentEL_EL1		(1 << 2)
 #define CurrentEL_EL2		(2 << 2)
 
+/* Additional SPSR bits not exposed in the UABI */
+#define PSR_IL_BIT		(1 << 20)
+
 /* AArch32-specific ptrace requests */
 #define COMPAT_PTRACE_GETREGS		12
 #define COMPAT_PTRACE_SETREGS		13
@@ -50,6 +53,7 @@
 #define PSR_AA32_I_BIT		0x00000080
 #define PSR_AA32_A_BIT		0x00000100
 #define PSR_AA32_E_BIT		0x00000200
+#define PSR_AA32_SSBS_BIT	0x00800000
 #define PSR_AA32_DIT_BIT	0x01000000
 #define PSR_AA32_Q_BIT		0x08000000
 #define PSR_AA32_V_BIT		0x10000000
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h
deleted file mode 100644
index 2656a0fd05a6..000000000000
--- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPMD_H_
-#define __ARM64_S2_PGTABLE_NOPMD_H_
-
-#include <asm/stage2_pgtable-nopud.h>
-
-#define __S2_PGTABLE_PMD_FOLDED
-
-#define S2_PMD_SHIFT		S2_PUD_SHIFT
-#define S2_PTRS_PER_PMD		1
-#define S2_PMD_SIZE		(1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK		(~(S2_PMD_SIZE-1))
-
-#define stage2_pud_none(pud)			(0)
-#define stage2_pud_present(pud)			(1)
-#define stage2_pud_clear(pud)			do { } while (0)
-#define stage2_pud_populate(pud, pmd)		do { } while (0)
-#define stage2_pmd_offset(pud, address)		((pmd_t *)(pud))
-
-#define stage2_pmd_free(pmd)			do { } while (0)
-
-#define stage2_pmd_addr_end(addr, end)		(end)
-
-#define stage2_pud_huge(pud)			(0)
-#define stage2_pmd_table_empty(pmdp)		(0)
-
-#endif
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h
deleted file mode 100644
index 5ee87b54ebf3..000000000000
--- a/arch/arm64/include/asm/stage2_pgtable-nopud.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPUD_H_
-#define __ARM64_S2_PGTABLE_NOPUD_H_
-
-#define __S2_PGTABLE_PUD_FOLDED
-
-#define S2_PUD_SHIFT		S2_PGDIR_SHIFT
-#define S2_PTRS_PER_PUD		1
-#define S2_PUD_SIZE		(_AC(1, UL) << S2_PUD_SHIFT)
-#define S2_PUD_MASK		(~(S2_PUD_SIZE-1))
-
-#define stage2_pgd_none(pgd)			(0)
-#define stage2_pgd_present(pgd)			(1)
-#define stage2_pgd_clear(pgd)			do { } while (0)
-#define stage2_pgd_populate(pgd, pud)	do { } while (0)
-
-#define stage2_pud_offset(pgd, address)		((pud_t *)(pgd))
-
-#define stage2_pud_free(x)			do { } while (0)
-
-#define stage2_pud_addr_end(addr, end)		(end)
-#define stage2_pud_table_empty(pmdp)		(0)
-
-#endif
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 8b68099348e5..d352f6df8d2c 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -19,9 +19,17 @@
 #ifndef __ARM64_S2_PGTABLE_H_
 #define __ARM64_S2_PGTABLE_H_
 
+#include <linux/hugetlb.h>
 #include <asm/pgtable.h>
 
 /*
+ * PGDIR_SHIFT determines the size a top-level page table entry can map
+ * and depends on the number of levels in the page table. Compute the
+ * PGDIR_SHIFT for a given number of levels.
+ */
+#define pt_levels_pgdir_shift(lvls)	ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
+
+/*
  * The hardware supports concatenation of up to 16 tables at stage2 entry level
  * and we use the feature whenever possible.
  *
@@ -29,112 +37,208 @@
  * On arm64, the smallest PAGE_SIZE supported is 4k, which means
  *             (PAGE_SHIFT - 3) > 4 holds for all page sizes.
  * This implies, the total number of page table levels at stage2 expected
- * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4)
+ * by the hardware is actually the number of levels required for (IPA_SHIFT - 4)
  * in normal translations(e.g, stage1), since we cannot have another level in
- * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4).
+ * the range (IPA_SHIFT, IPA_SHIFT - 4).
  */
-#define STAGE2_PGTABLE_LEVELS		ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
+#define stage2_pgtable_levels(ipa)	ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
+#define kvm_stage2_levels(kvm)		VTCR_EL2_LVLS(kvm->arch.vtcr)
 
-/*
- * With all the supported VA_BITs and 40bit guest IPA, the following condition
- * is always true:
- *
- *       STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS
- *
- * We base our stage-2 page table walker helpers on this assumption and
- * fall back to using the host version of the helper wherever possible.
- * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back
- * to using the host version, since it is guaranteed it is not folded at host.
- *
- * If the condition breaks in the future, we can rearrange the host level
- * definitions and reuse them for stage2. Till then...
- */
-#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS
-#error "Unsupported combination of guest IPA and host VA_BITS."
-#endif
-
-/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */
-#define S2_PGDIR_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS)
-#define S2_PGDIR_SIZE			(_AC(1, UL) << S2_PGDIR_SHIFT)
-#define S2_PGDIR_MASK			(~(S2_PGDIR_SIZE - 1))
+/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */
+#define stage2_pgdir_shift(kvm)		pt_levels_pgdir_shift(kvm_stage2_levels(kvm))
+#define stage2_pgdir_size(kvm)		(1ULL << stage2_pgdir_shift(kvm))
+#define stage2_pgdir_mask(kvm)		~(stage2_pgdir_size(kvm) - 1)
 
 /*
  * The number of PTRS across all concatenated stage2 tables given by the
  * number of bits resolved at the initial level.
+ * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
+ * in which case, stage2_pgd_ptrs will have one entry.
  */
-#define PTRS_PER_S2_PGD			(1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT))
+#define pgd_ptrs_shift(ipa, pgdir_shift)	\
+	((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
+#define __s2_pgd_ptrs(ipa, lvls)		\
+	(1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
+#define __s2_pgd_size(ipa, lvls)	(__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
+
+#define stage2_pgd_ptrs(kvm)		__s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
+#define stage2_pgd_size(kvm)		__s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
 
 /*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD.
+ * kvm_mmmu_cache_min_pages() is the number of pages required to install
+ * a stage-2 translation. We pre-allocate the entry level page table at
+ * the VM creation.
  */
-#define KVM_MMU_CACHE_MIN_PAGES		(STAGE2_PGTABLE_LEVELS - 1)
+#define kvm_mmu_cache_min_pages(kvm)	(kvm_stage2_levels(kvm) - 1)
 
-
-#if STAGE2_PGTABLE_LEVELS > 3
+/* Stage2 PUD definitions when the level is present */
+static inline bool kvm_stage2_has_pud(struct kvm *kvm)
+{
+	return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
+}
 
 #define S2_PUD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE			(_AC(1, UL) << S2_PUD_SHIFT)
+#define S2_PUD_SIZE			(1UL << S2_PUD_SHIFT)
 #define S2_PUD_MASK			(~(S2_PUD_SIZE - 1))
 
-#define stage2_pgd_none(pgd)				pgd_none(pgd)
-#define stage2_pgd_clear(pgd)				pgd_clear(pgd)
-#define stage2_pgd_present(pgd)				pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud)			pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address)			pud_offset(pgd, address)
-#define stage2_pud_free(pud)				pud_free(NULL, pud)
+static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd)
+{
+	if (kvm_stage2_has_pud(kvm))
+		return pgd_none(pgd);
+	else
+		return 0;
+}
 
-#define stage2_pud_table_empty(pudp)			kvm_page_empty(pudp)
+static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp)
+{
+	if (kvm_stage2_has_pud(kvm))
+		pgd_clear(pgdp);
+}
 
-static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd)
 {
-	phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+	if (kvm_stage2_has_pud(kvm))
+		return pgd_present(pgd);
+	else
+		return 1;
+}
 
-	return (boundary - 1 < end - 1) ? boundary : end;
+static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud)
+{
+	if (kvm_stage2_has_pud(kvm))
+		pgd_populate(NULL, pgd, pud);
+}
+
+static inline pud_t *stage2_pud_offset(struct kvm *kvm,
+				       pgd_t *pgd, unsigned long address)
+{
+	if (kvm_stage2_has_pud(kvm))
+		return pud_offset(pgd, address);
+	else
+		return (pud_t *)pgd;
 }
 
-#endif		/* STAGE2_PGTABLE_LEVELS > 3 */
+static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
+{
+	if (kvm_stage2_has_pud(kvm))
+		pud_free(NULL, pud);
+}
 
+static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
+{
+	if (kvm_stage2_has_pud(kvm))
+		return kvm_page_empty(pudp);
+	else
+		return false;
+}
 
-#if STAGE2_PGTABLE_LEVELS > 2
+static inline phys_addr_t
+stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+	if (kvm_stage2_has_pud(kvm)) {
+		phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+
+		return (boundary - 1 < end - 1) ? boundary : end;
+	} else {
+		return end;
+	}
+}
+
+/* Stage2 PMD definitions when the level is present */
+static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
+{
+	return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
+}
 
 #define S2_PMD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE			(_AC(1, UL) << S2_PMD_SHIFT)
+#define S2_PMD_SIZE			(1UL << S2_PMD_SHIFT)
 #define S2_PMD_MASK			(~(S2_PMD_SIZE - 1))
 
-#define stage2_pud_none(pud)				pud_none(pud)
-#define stage2_pud_clear(pud)				pud_clear(pud)
-#define stage2_pud_present(pud)				pud_present(pud)
-#define stage2_pud_populate(pud, pmd)			pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address)			pmd_offset(pud, address)
-#define stage2_pmd_free(pmd)				pmd_free(NULL, pmd)
+static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pud_none(pud);
+	else
+		return 0;
+}
+
+static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		pud_clear(pud);
+}
 
-#define stage2_pud_huge(pud)				pud_huge(pud)
-#define stage2_pmd_table_empty(pmdp)			kvm_page_empty(pmdp)
+static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pud_present(pud);
+	else
+		return 1;
+}
 
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
 {
-	phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
+	if (kvm_stage2_has_pmd(kvm))
+		pud_populate(NULL, pud, pmd);
+}
 
-	return (boundary - 1 < end - 1) ? boundary : end;
+static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
+				       pud_t *pud, unsigned long address)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pmd_offset(pud, address);
+	else
+		return (pmd_t *)pud;
 }
 
-#endif		/* STAGE2_PGTABLE_LEVELS > 2 */
+static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		pmd_free(NULL, pmd);
+}
+
+static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return pud_huge(pud);
+	else
+		return 0;
+}
+
+static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
+{
+	if (kvm_stage2_has_pmd(kvm))
+		return kvm_page_empty(pmdp);
+	else
+		return 0;
+}
 
-#define stage2_pte_table_empty(ptep)			kvm_page_empty(ptep)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+	if (kvm_stage2_has_pmd(kvm)) {
+		phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
 
-#if STAGE2_PGTABLE_LEVELS == 2
-#include <asm/stage2_pgtable-nopmd.h>
-#elif STAGE2_PGTABLE_LEVELS == 3
-#include <asm/stage2_pgtable-nopud.h>
-#endif
+		return (boundary - 1 < end - 1) ? boundary : end;
+	} else {
+		return end;
+	}
+}
 
+static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
+{
+	return kvm_page_empty(ptep);
+}
 
-#define stage2_pgd_index(addr)				(((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
+static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
+{
+	return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
+}
 
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
-	phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK;
+	phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm);
 
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
diff --git a/arch/arm64/include/asm/stat.h b/arch/arm64/include/asm/stat.h
index eab738019707..397c6ccd04e7 100644
--- a/arch/arm64/include/asm/stat.h
+++ b/arch/arm64/include/asm/stat.h
@@ -20,7 +20,7 @@
 
 #ifdef CONFIG_COMPAT
 
-#include <linux/compat_time.h>
+#include <linux/time.h>
 #include <asm/compat.h>
 
 /*
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index c1470931b897..0c909c4a932f 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -20,7 +20,6 @@
 #ifndef __ASM_SYSREG_H
 #define __ASM_SYSREG_H
 
-#include <asm/compiler.h>
 #include <linux/stringify.h>
 
 /*
@@ -84,13 +83,26 @@
 
 #endif	/* CONFIG_BROKEN_GAS_INST */
 
-#define REG_PSTATE_PAN_IMM		sys_reg(0, 0, 4, 0, 4)
-#define REG_PSTATE_UAO_IMM		sys_reg(0, 0, 4, 0, 3)
+/*
+ * Instructions for modifying PSTATE fields.
+ * As per Arm ARM for v8-A, Section "C.5.1.3 op0 == 0b00, architectural hints,
+ * barriers and CLREX, and PSTATE access", ARM DDI 0487 C.a, system instructions
+ * for accessing PSTATE fields have the following encoding:
+ *	Op0 = 0, CRn = 4
+ *	Op1, Op2 encodes the PSTATE field modified and defines the constraints.
+ *	CRm = Imm4 for the instruction.
+ *	Rt = 0x1f
+ */
+#define pstate_field(op1, op2)		((op1) << Op1_shift | (op2) << Op2_shift)
+#define PSTATE_Imm_shift		CRm_shift
+
+#define PSTATE_PAN			pstate_field(0, 4)
+#define PSTATE_UAO			pstate_field(0, 3)
+#define PSTATE_SSBS			pstate_field(3, 1)
 
-#define SET_PSTATE_PAN(x) __emit_inst(0xd5000000 | REG_PSTATE_PAN_IMM |	\
-				      (!!x)<<8 | 0x1f)
-#define SET_PSTATE_UAO(x) __emit_inst(0xd5000000 | REG_PSTATE_UAO_IMM |	\
-				      (!!x)<<8 | 0x1f)
+#define SET_PSTATE_PAN(x)		__emit_inst(0xd500401f | PSTATE_PAN | ((!!x) << PSTATE_Imm_shift))
+#define SET_PSTATE_UAO(x)		__emit_inst(0xd500401f | PSTATE_UAO | ((!!x) << PSTATE_Imm_shift))
+#define SET_PSTATE_SSBS(x)		__emit_inst(0xd500401f | PSTATE_SSBS | ((!!x) << PSTATE_Imm_shift))
 
 #define SYS_DC_ISW			sys_insn(1, 0, 7, 6, 2)
 #define SYS_DC_CSW			sys_insn(1, 0, 7, 10, 2)
@@ -419,6 +431,7 @@
 #define SYS_ICH_LR15_EL2		__SYS__LR8_EL2(7)
 
 /* Common SCTLR_ELx flags. */
+#define SCTLR_ELx_DSSBS	(1UL << 44)
 #define SCTLR_ELx_EE    (1 << 25)
 #define SCTLR_ELx_IESB	(1 << 21)
 #define SCTLR_ELx_WXN	(1 << 19)
@@ -439,7 +452,7 @@
 			 (1 << 10) | (1 << 13) | (1 << 14) | (1 << 15) | \
 			 (1 << 17) | (1 << 20) | (1 << 24) | (1 << 26) | \
 			 (1 << 27) | (1 << 30) | (1 << 31) | \
-			 (0xffffffffUL << 32))
+			 (0xffffefffUL << 32))
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 #define ENDIAN_SET_EL2		SCTLR_ELx_EE
@@ -453,7 +466,7 @@
 #define SCTLR_EL2_SET	(SCTLR_ELx_IESB   | ENDIAN_SET_EL2   | SCTLR_EL2_RES1)
 #define SCTLR_EL2_CLEAR	(SCTLR_ELx_M      | SCTLR_ELx_A    | SCTLR_ELx_C   | \
 			 SCTLR_ELx_SA     | SCTLR_ELx_I    | SCTLR_ELx_WXN | \
-			 ENDIAN_CLEAR_EL2 | SCTLR_EL2_RES0)
+			 SCTLR_ELx_DSSBS | ENDIAN_CLEAR_EL2 | SCTLR_EL2_RES0)
 
 #if (SCTLR_EL2_SET ^ SCTLR_EL2_CLEAR) != 0xffffffffffffffff
 #error "Inconsistent SCTLR_EL2 set/clear bits"
@@ -477,7 +490,7 @@
 			 (1 << 29))
 #define SCTLR_EL1_RES0  ((1 << 6)  | (1 << 10) | (1 << 13) | (1 << 17) | \
 			 (1 << 27) | (1 << 30) | (1 << 31) | \
-			 (0xffffffffUL << 32))
+			 (0xffffefffUL << 32))
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 #define ENDIAN_SET_EL1		(SCTLR_EL1_E0E | SCTLR_ELx_EE)
@@ -489,12 +502,12 @@
 
 #define SCTLR_EL1_SET	(SCTLR_ELx_M    | SCTLR_ELx_C    | SCTLR_ELx_SA   |\
 			 SCTLR_EL1_SA0  | SCTLR_EL1_SED  | SCTLR_ELx_I    |\
-			 SCTLR_EL1_DZE  | SCTLR_EL1_UCT  | SCTLR_EL1_NTWI |\
+			 SCTLR_EL1_DZE  | SCTLR_EL1_UCT                   |\
 			 SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\
 			 ENDIAN_SET_EL1 | SCTLR_EL1_UCI  | SCTLR_EL1_RES1)
 #define SCTLR_EL1_CLEAR	(SCTLR_ELx_A   | SCTLR_EL1_CP15BEN | SCTLR_EL1_ITD    |\
 			 SCTLR_EL1_UMA | SCTLR_ELx_WXN     | ENDIAN_CLEAR_EL1 |\
-			 SCTLR_EL1_RES0)
+			 SCTLR_ELx_DSSBS | SCTLR_EL1_NTWI  | SCTLR_EL1_RES0)
 
 #if (SCTLR_EL1_SET ^ SCTLR_EL1_CLEAR) != 0xffffffffffffffff
 #error "Inconsistent SCTLR_EL1 set/clear bits"
@@ -544,6 +557,13 @@
 #define ID_AA64PFR0_EL0_64BIT_ONLY	0x1
 #define ID_AA64PFR0_EL0_32BIT_64BIT	0x2
 
+/* id_aa64pfr1 */
+#define ID_AA64PFR1_SSBS_SHIFT		4
+
+#define ID_AA64PFR1_SSBS_PSTATE_NI	0
+#define ID_AA64PFR1_SSBS_PSTATE_ONLY	1
+#define ID_AA64PFR1_SSBS_PSTATE_INSNS	2
+
 /* id_aa64mmfr0 */
 #define ID_AA64MMFR0_TGRAN4_SHIFT	28
 #define ID_AA64MMFR0_TGRAN64_SHIFT	24
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index 28893a0b141d..0e2a0ecaf484 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -33,7 +33,8 @@ void die(const char *msg, struct pt_regs *regs, int err);
 
 struct siginfo;
 void arm64_notify_die(const char *str, struct pt_regs *regs,
-		      struct siginfo *info, int err);
+		      int signo, int sicode, void __user *addr,
+		      int err);
 
 void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int,
 					     struct pt_regs *),
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index a3233167be60..106fdc951b6e 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -22,16 +22,10 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-
-#define tlb_remove_entry(tlb, entry)	tlb_remove_table(tlb, entry)
 static inline void __tlb_remove_table(void *_table)
 {
 	free_page_and_swap_cache((struct page *)_table);
 }
-#else
-#define tlb_remove_entry(tlb, entry)	tlb_remove_page(tlb, entry)
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 
 static void tlb_flush(struct mmu_gather *tlb);
 
@@ -40,36 +34,35 @@ static void tlb_flush(struct mmu_gather *tlb);
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
 	struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
+	bool last_level = !tlb->freed_tables;
+	unsigned long stride = tlb_get_unmap_size(tlb);
 
 	/*
-	 * The ASID allocator will either invalidate the ASID or mark
-	 * it as used.
+	 * If we're tearing down the address space then we only care about
+	 * invalidating the walk-cache, since the ASID allocator won't
+	 * reallocate our ASID without invalidating the entire TLB.
 	 */
-	if (tlb->fullmm)
+	if (tlb->fullmm) {
+		if (!last_level)
+			flush_tlb_mm(tlb->mm);
 		return;
+	}
 
-	/*
-	 * The intermediate page table levels are already handled by
-	 * the __(pte|pmd|pud)_free_tlb() functions, so last level
-	 * TLBI is sufficient here.
-	 */
-	__flush_tlb_range(&vma, tlb->start, tlb->end, true);
+	__flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level);
 }
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 				  unsigned long addr)
 {
-	__flush_tlb_pgtable(tlb->mm, addr);
 	pgtable_page_dtor(pte);
-	tlb_remove_entry(tlb, pte);
+	tlb_remove_table(tlb, pte);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 				  unsigned long addr)
 {
-	__flush_tlb_pgtable(tlb->mm, addr);
-	tlb_remove_entry(tlb, virt_to_page(pmdp));
+	tlb_remove_table(tlb, virt_to_page(pmdp));
 }
 #endif
 
@@ -77,8 +70,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
 				  unsigned long addr)
 {
-	__flush_tlb_pgtable(tlb->mm, addr);
-	tlb_remove_entry(tlb, virt_to_page(pudp));
+	tlb_remove_table(tlb, virt_to_page(pudp));
 }
 #endif
 
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index a4a1901140ee..c3c0387aee18 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -70,43 +70,73 @@
 	})
 
 /*
- *	TLB Management
- *	==============
+ *	TLB Invalidation
+ *	================
  *
- *	The TLB specific code is expected to perform whatever tests it needs
- *	to determine if it should invalidate the TLB for each call.  Start
- *	addresses are inclusive and end addresses are exclusive; it is safe to
- *	round these addresses down.
+ * 	This header file implements the low-level TLB invalidation routines
+ *	(sometimes referred to as "flushing" in the kernel) for arm64.
  *
- *	flush_tlb_all()
+ *	Every invalidation operation uses the following template:
+ *
+ *	DSB ISHST	// Ensure prior page-table updates have completed
+ *	TLBI ...	// Invalidate the TLB
+ *	DSB ISH		// Ensure the TLB invalidation has completed
+ *      if (invalidated kernel mappings)
+ *		ISB	// Discard any instructions fetched from the old mapping
+ *
+ *
+ *	The following functions form part of the "core" TLB invalidation API,
+ *	as documented in Documentation/core-api/cachetlb.rst:
  *
- *		Invalidate the entire TLB.
+ *	flush_tlb_all()
+ *		Invalidate the entire TLB (kernel + user) on all CPUs
  *
  *	flush_tlb_mm(mm)
+ *		Invalidate an entire user address space on all CPUs.
+ *		The 'mm' argument identifies the ASID to invalidate.
+ *
+ *	flush_tlb_range(vma, start, end)
+ *		Invalidate the virtual-address range '[start, end)' on all
+ *		CPUs for the user address space corresponding to 'vma->mm'.
+ *		Note that this operation also invalidates any walk-cache
+ *		entries associated with translations for the specified address
+ *		range.
+ *
+ *	flush_tlb_kernel_range(start, end)
+ *		Same as flush_tlb_range(..., start, end), but applies to
+ * 		kernel mappings rather than a particular user address space.
+ *		Whilst not explicitly documented, this function is used when
+ *		unmapping pages from vmalloc/io space.
+ *
+ *	flush_tlb_page(vma, addr)
+ *		Invalidate a single user mapping for address 'addr' in the
+ *		address space corresponding to 'vma->mm'.  Note that this
+ *		operation only invalidates a single, last-level page-table
+ *		entry and therefore does not affect any walk-caches.
  *
- *		Invalidate all TLB entries in a particular address space.
- *		- mm	- mm_struct describing address space
  *
- *	flush_tlb_range(mm,start,end)
+ *	Next, we have some undocumented invalidation routines that you probably
+ *	don't want to call unless you know what you're doing:
  *
- *		Invalidate a range of TLB entries in the specified address
- *		space.
- *		- mm	- mm_struct describing address space
- *		- start - start address (may not be aligned)
- *		- end	- end address (exclusive, may not be aligned)
+ *	local_flush_tlb_all()
+ *		Same as flush_tlb_all(), but only applies to the calling CPU.
  *
- *	flush_tlb_page(vaddr,vma)
+ *	__flush_tlb_kernel_pgtable(addr)
+ *		Invalidate a single kernel mapping for address 'addr' on all
+ *		CPUs, ensuring that any walk-cache entries associated with the
+ *		translation are also invalidated.
  *
- *		Invalidate the specified page in the specified address range.
- *		- vaddr - virtual address (may not be aligned)
- *		- vma	- vma_struct describing address range
+ *	__flush_tlb_range(vma, start, end, stride, last_level)
+ *		Invalidate the virtual-address range '[start, end)' on all
+ *		CPUs for the user address space corresponding to 'vma->mm'.
+ *		The invalidation operations are issued at a granularity
+ *		determined by 'stride' and only affect any walk-cache entries
+ *		if 'last_level' is equal to false.
  *
- *	flush_kern_tlb_page(kaddr)
  *
- *		Invalidate the TLB entry for the specified page.  The address
- *		will be in the kernels virtual memory space.  Current uses
- *		only require the D-TLB to be invalidated.
- *		- kaddr - Kernel virtual memory address
+ *	Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented
+ *	on top of these routines, since that is our interface to the mmu_gather
+ *	API as used by munmap() and friends.
  */
 static inline void local_flush_tlb_all(void)
 {
@@ -149,25 +179,28 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
  * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
  * necessarily a performance improvement.
  */
-#define MAX_TLB_RANGE	(1024UL << PAGE_SHIFT)
+#define MAX_TLBI_OPS	1024UL
 
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end,
-				     bool last_level)
+				     unsigned long stride, bool last_level)
 {
 	unsigned long asid = ASID(vma->vm_mm);
 	unsigned long addr;
 
-	if ((end - start) > MAX_TLB_RANGE) {
+	if ((end - start) > (MAX_TLBI_OPS * stride)) {
 		flush_tlb_mm(vma->vm_mm);
 		return;
 	}
 
+	/* Convert the stride into units of 4k */
+	stride >>= 12;
+
 	start = __TLBI_VADDR(start, asid);
 	end = __TLBI_VADDR(end, asid);
 
 	dsb(ishst);
-	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) {
+	for (addr = start; addr < end; addr += stride) {
 		if (last_level) {
 			__tlbi(vale1is, addr);
 			__tlbi_user(vale1is, addr);
@@ -182,14 +215,18 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 static inline void flush_tlb_range(struct vm_area_struct *vma,
 				   unsigned long start, unsigned long end)
 {
-	__flush_tlb_range(vma, start, end, false);
+	/*
+	 * We cannot use leaf-only invalidation here, since we may be invalidating
+	 * table entries as part of collapsing hugepages or moving page tables.
+	 */
+	__flush_tlb_range(vma, start, end, PAGE_SIZE, false);
 }
 
 static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
 	unsigned long addr;
 
-	if ((end - start) > MAX_TLB_RANGE) {
+	if ((end - start) > (MAX_TLBI_OPS * PAGE_SIZE)) {
 		flush_tlb_all();
 		return;
 	}
@@ -199,7 +236,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end
 
 	dsb(ishst);
 	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
-		__tlbi(vaae1is, addr);
+		__tlbi(vaale1is, addr);
 	dsb(ish);
 	isb();
 }
@@ -208,20 +245,11 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end
  * Used to invalidate the TLB (walk caches) corresponding to intermediate page
  * table levels (pgd/pud/pmd).
  */
-static inline void __flush_tlb_pgtable(struct mm_struct *mm,
-				       unsigned long uaddr)
-{
-	unsigned long addr = __TLBI_VADDR(uaddr, ASID(mm));
-
-	__tlbi(vae1is, addr);
-	__tlbi_user(vae1is, addr);
-	dsb(ish);
-}
-
 static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
 {
 	unsigned long addr = __TLBI_VADDR(kaddr, 0);
 
+	dsb(ishst);
 	__tlbi(vaae1is, addr);
 	dsb(ish);
 }
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 49a0fee4f89b..0524f2438649 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -45,6 +45,9 @@ int pcibus_to_node(struct pci_bus *bus);
 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
 
+/* Enable topology flag updates */
+#define arch_update_cpu_topology topology_update_cpu_topology
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index c320f3bf6c57..f9c1aa6167d2 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -37,8 +37,9 @@ void register_undef_hook(struct undef_hook *hook);
 void unregister_undef_hook(struct undef_hook *hook);
 void force_signal_inject(int signal, int code, unsigned long address);
 void arm64_notify_segfault(unsigned long addr);
-void arm64_force_sig_info(struct siginfo *info, const char *str,
-			  struct task_struct *tsk);
+void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str);
+void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str);
+void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, const char *str);
 
 /*
  * Move regs->pc to next instruction and do necessary setup before it
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index e66b0fca99c2..07c34087bd5e 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -32,7 +32,6 @@
 #include <asm/cpufeature.h>
 #include <asm/ptrace.h>
 #include <asm/memory.h>
-#include <asm/compiler.h>
 #include <asm/extable.h>
 
 #define get_ds()	(KERNEL_DS)
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index e0d0f5b856e7..b13ca091f833 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -18,11 +18,11 @@
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
 #define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
 #define __ARCH_WANT_SYS_NICE
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_COMPAT_SYS_SENDFILE
+#define __ARCH_WANT_SYS_UTIME32
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_VFORK
 
diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h
index 4e22b7a8c038..2788e95d0ff0 100644
--- a/arch/arm64/include/asm/xen/events.h
+++ b/arch/arm64/include/asm/xen/events.h
@@ -14,7 +14,7 @@ enum ipi_vector {
 
 static inline int xen_irqs_disabled(struct pt_regs *regs)
 {
-	return raw_irqs_disabled_flags((unsigned long) regs->pstate);
+	return !interrupts_enabled(regs);
 }
 
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
diff --git a/arch/arm64/include/uapi/asm/Kbuild b/arch/arm64/include/uapi/asm/Kbuild
index 198afbf0688f..6c5adf458690 100644
--- a/arch/arm64/include/uapi/asm/Kbuild
+++ b/arch/arm64/include/uapi/asm/Kbuild
@@ -19,3 +19,4 @@ generic-y += swab.h
 generic-y += termbits.h
 generic-y += termios.h
 generic-y += types.h
+generic-y += siginfo.h
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 17c65c8f33cb..2bcd6e4f3474 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -48,5 +48,6 @@
 #define HWCAP_USCAT		(1 << 25)
 #define HWCAP_ILRCPC		(1 << 26)
 #define HWCAP_FLAGM		(1 << 27)
+#define HWCAP_SSBS		(1 << 28)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 98c4ce55d9c3..a36227fdb084 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -46,6 +46,7 @@
 #define PSR_I_BIT	0x00000080
 #define PSR_A_BIT	0x00000100
 #define PSR_D_BIT	0x00000200
+#define PSR_SSBS_BIT	0x00001000
 #define PSR_PAN_BIT	0x00400000
 #define PSR_UAO_BIT	0x00800000
 #define PSR_V_BIT	0x10000000
diff --git a/arch/arm64/include/uapi/asm/siginfo.h b/arch/arm64/include/uapi/asm/siginfo.h
deleted file mode 100644
index 574d12f86039..000000000000
--- a/arch/arm64/include/uapi/asm/siginfo.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_SIGINFO_H
-#define __ASM_SIGINFO_H
-
-#define __ARCH_SI_PREAMBLE_SIZE	(4 * sizeof(int))
-
-#include <asm-generic/siginfo.h>
-
-#endif
diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h
index 5072cbd15c82..dae1584cf017 100644
--- a/arch/arm64/include/uapi/asm/unistd.h
+++ b/arch/arm64/include/uapi/asm/unistd.h
@@ -16,5 +16,6 @@
  */
 
 #define __ARCH_WANT_RENAMEAT
+#define __ARCH_WANT_NEW_STAT
 
 #include <asm-generic/unistd.h>
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index dec10898d688..a509e35132d2 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -68,21 +68,43 @@ static bool
 has_mismatched_cache_type(const struct arm64_cpu_capabilities *entry,
 			  int scope)
 {
-	u64 mask = CTR_CACHE_MINLINE_MASK;
-
-	/* Skip matching the min line sizes for cache type check */
-	if (entry->capability == ARM64_MISMATCHED_CACHE_TYPE)
-		mask ^= arm64_ftr_reg_ctrel0.strict_mask;
+	u64 mask = arm64_ftr_reg_ctrel0.strict_mask;
+	u64 sys = arm64_ftr_reg_ctrel0.sys_val & mask;
+	u64 ctr_raw, ctr_real;
 
 	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
-	return (read_cpuid_cachetype() & mask) !=
-	       (arm64_ftr_reg_ctrel0.sys_val & mask);
+
+	/*
+	 * We want to make sure that all the CPUs in the system expose
+	 * a consistent CTR_EL0 to make sure that applications behaves
+	 * correctly with migration.
+	 *
+	 * If a CPU has CTR_EL0.IDC but does not advertise it via CTR_EL0 :
+	 *
+	 * 1) It is safe if the system doesn't support IDC, as CPU anyway
+	 *    reports IDC = 0, consistent with the rest.
+	 *
+	 * 2) If the system has IDC, it is still safe as we trap CTR_EL0
+	 *    access on this CPU via the ARM64_HAS_CACHE_IDC capability.
+	 *
+	 * So, we need to make sure either the raw CTR_EL0 or the effective
+	 * CTR_EL0 matches the system's copy to allow a secondary CPU to boot.
+	 */
+	ctr_raw = read_cpuid_cachetype() & mask;
+	ctr_real = read_cpuid_effective_cachetype() & mask;
+
+	return (ctr_real != sys) && (ctr_raw != sys);
 }
 
 static void
 cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *__unused)
 {
-	sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
+	u64 mask = arm64_ftr_reg_ctrel0.strict_mask;
+
+	/* Trap CTR_EL0 access on this CPU, only if it has a mismatch */
+	if ((read_cpuid_cachetype() & mask) !=
+	    (arm64_ftr_reg_ctrel0.sys_val & mask))
+		sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
 }
 
 atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
@@ -116,6 +138,15 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
 	static DEFINE_SPINLOCK(bp_lock);
 	int cpu, slot = -1;
 
+	/*
+	 * enable_smccc_arch_workaround_1() passes NULL for the hyp_vecs
+	 * start/end if we're a guest. Skip the hyp-vectors work.
+	 */
+	if (!hyp_vecs_start) {
+		__this_cpu_write(bp_hardening_data.fn, fn);
+		return;
+	}
+
 	spin_lock(&bp_lock);
 	for_each_possible_cpu(cpu) {
 		if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
@@ -312,6 +343,14 @@ void __init arm64_enable_wa2_handling(struct alt_instr *alt,
 
 void arm64_set_ssbd_mitigation(bool state)
 {
+	if (this_cpu_has_cap(ARM64_SSBS)) {
+		if (state)
+			asm volatile(SET_PSTATE_SSBS(0));
+		else
+			asm volatile(SET_PSTATE_SSBS(1));
+		return;
+	}
+
 	switch (psci_ops.conduit) {
 	case PSCI_CONDUIT_HVC:
 		arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL);
@@ -336,6 +375,11 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
 
 	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
 
+	if (this_cpu_has_cap(ARM64_SSBS)) {
+		required = false;
+		goto out_printmsg;
+	}
+
 	if (psci_ops.smccc_version == SMCCC_VERSION_1_0) {
 		ssbd_state = ARM64_SSBD_UNKNOWN;
 		return false;
@@ -384,7 +428,6 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
 
 	switch (ssbd_state) {
 	case ARM64_SSBD_FORCE_DISABLE:
-		pr_info_once("%s disabled from command-line\n", entry->desc);
 		arm64_set_ssbd_mitigation(false);
 		required = false;
 		break;
@@ -397,7 +440,6 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
 		break;
 
 	case ARM64_SSBD_FORCE_ENABLE:
-		pr_info_once("%s forced from command-line\n", entry->desc);
 		arm64_set_ssbd_mitigation(true);
 		required = true;
 		break;
@@ -407,10 +449,27 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
 		break;
 	}
 
+out_printmsg:
+	switch (ssbd_state) {
+	case ARM64_SSBD_FORCE_DISABLE:
+		pr_info_once("%s disabled from command-line\n", entry->desc);
+		break;
+
+	case ARM64_SSBD_FORCE_ENABLE:
+		pr_info_once("%s forced from command-line\n", entry->desc);
+		break;
+	}
+
 	return required;
 }
 #endif	/* CONFIG_ARM64_SSBD */
 
+static void __maybe_unused
+cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
+{
+	sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCI, 0);
+}
+
 #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max)	\
 	.matches = is_affected_midr_range,			\
 	.midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max)
@@ -616,14 +675,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	},
 #endif
 	{
-		.desc = "Mismatched cache line size",
-		.capability = ARM64_MISMATCHED_CACHE_LINE_SIZE,
-		.matches = has_mismatched_cache_type,
-		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
-		.cpu_enable = cpu_enable_trap_ctr_access,
-	},
-	{
-		.desc = "Mismatched cache type",
+		.desc = "Mismatched cache type (CTR_EL0)",
 		.capability = ARM64_MISMATCHED_CACHE_TYPE,
 		.matches = has_mismatched_cache_type,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
@@ -680,6 +732,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		.matches = has_ssbd_mitigation,
 	},
 #endif
+#ifdef CONFIG_ARM64_ERRATUM_1188873
+	{
+		/* Cortex-A76 r0p0 to r2p0 */
+		.desc = "ARM erratum 1188873",
+		.capability = ARM64_WORKAROUND_1188873,
+		ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0),
+	},
+#endif
 	{
 	}
 };
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e238b7932096..af50064dea51 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -20,6 +20,7 @@
 
 #include <linux/bsearch.h>
 #include <linux/cpumask.h>
+#include <linux/crash_dump.h>
 #include <linux/sort.h>
 #include <linux/stop_machine.h>
 #include <linux/types.h>
@@ -117,6 +118,7 @@ EXPORT_SYMBOL(cpu_hwcap_keys);
 static bool __maybe_unused
 cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused);
 
+static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap);
 
 /*
  * NOTE: Any changes to the visibility of features should be kept in
@@ -164,6 +166,11 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 	ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
+	ARM64_FTR_END,
+};
+
 static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
 	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI),
 	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI),
@@ -371,7 +378,7 @@ static const struct __ftr_reg_entry {
 
 	/* Op1 = 0, CRn = 0, CRm = 4 */
 	ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0),
-	ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_raz),
+	ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1),
 	ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_raz),
 
 	/* Op1 = 0, CRn = 0, CRm = 5 */
@@ -657,7 +664,6 @@ void update_cpu_features(int cpu,
 
 	/*
 	 * EL3 is not our concern.
-	 * ID_AA64PFR1 is currently RES0.
 	 */
 	taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu,
 				      info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0);
@@ -848,15 +854,55 @@ static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unus
 }
 
 static bool has_cache_idc(const struct arm64_cpu_capabilities *entry,
-			  int __unused)
+			  int scope)
 {
-	return read_sanitised_ftr_reg(SYS_CTR_EL0) & BIT(CTR_IDC_SHIFT);
+	u64 ctr;
+
+	if (scope == SCOPE_SYSTEM)
+		ctr = arm64_ftr_reg_ctrel0.sys_val;
+	else
+		ctr = read_cpuid_effective_cachetype();
+
+	return ctr & BIT(CTR_IDC_SHIFT);
+}
+
+static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused)
+{
+	/*
+	 * If the CPU exposes raw CTR_EL0.IDC = 0, while effectively
+	 * CTR_EL0.IDC = 1 (from CLIDR values), we need to trap accesses
+	 * to the CTR_EL0 on this CPU and emulate it with the real/safe
+	 * value.
+	 */
+	if (!(read_cpuid_cachetype() & BIT(CTR_IDC_SHIFT)))
+		sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
 }
 
 static bool has_cache_dic(const struct arm64_cpu_capabilities *entry,
-			  int __unused)
+			  int scope)
 {
-	return read_sanitised_ftr_reg(SYS_CTR_EL0) & BIT(CTR_DIC_SHIFT);
+	u64 ctr;
+
+	if (scope == SCOPE_SYSTEM)
+		ctr = arm64_ftr_reg_ctrel0.sys_val;
+	else
+		ctr = read_cpuid_cachetype();
+
+	return ctr & BIT(CTR_DIC_SHIFT);
+}
+
+static bool __maybe_unused
+has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope)
+{
+	/*
+	 * Kdump isn't guaranteed to power-off all secondary CPUs, CNP
+	 * may share TLB entries with a CPU stuck in the crashed
+	 * kernel.
+	 */
+	 if (is_kdump_kernel())
+		return false;
+
+	return has_cpuid_feature(entry, scope);
 }
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
@@ -1035,6 +1081,70 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
 	WARN_ON(val & (7 << 27 | 7 << 21));
 }
 
+#ifdef CONFIG_ARM64_SSBD
+static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
+{
+	if (user_mode(regs))
+		return 1;
+
+	if (instr & BIT(PSTATE_Imm_shift))
+		regs->pstate |= PSR_SSBS_BIT;
+	else
+		regs->pstate &= ~PSR_SSBS_BIT;
+
+	arm64_skip_faulting_instruction(regs, 4);
+	return 0;
+}
+
+static struct undef_hook ssbs_emulation_hook = {
+	.instr_mask	= ~(1U << PSTATE_Imm_shift),
+	.instr_val	= 0xd500401f | PSTATE_SSBS,
+	.fn		= ssbs_emulation_handler,
+};
+
+static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused)
+{
+	static bool undef_hook_registered = false;
+	static DEFINE_SPINLOCK(hook_lock);
+
+	spin_lock(&hook_lock);
+	if (!undef_hook_registered) {
+		register_undef_hook(&ssbs_emulation_hook);
+		undef_hook_registered = true;
+	}
+	spin_unlock(&hook_lock);
+
+	if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
+		sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
+		arm64_set_ssbd_mitigation(false);
+	} else {
+		arm64_set_ssbd_mitigation(true);
+	}
+}
+#endif /* CONFIG_ARM64_SSBD */
+
+#ifdef CONFIG_ARM64_PAN
+static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
+{
+	/*
+	 * We modify PSTATE. This won't work from irq context as the PSTATE
+	 * is discarded once we return from the exception.
+	 */
+	WARN_ON_ONCE(in_interrupt());
+
+	sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0);
+	asm(SET_PSTATE_PAN(1));
+}
+#endif /* CONFIG_ARM64_PAN */
+
+#ifdef CONFIG_ARM64_RAS_EXTN
+static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused)
+{
+	/* Firmware may have left a deferred SError in this register. */
+	write_sysreg_s(0, SYS_DISR_EL1);
+}
+#endif /* CONFIG_ARM64_RAS_EXTN */
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -1184,6 +1294,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.capability = ARM64_HAS_CACHE_IDC,
 		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
 		.matches = has_cache_idc,
+		.cpu_enable = cpu_emulate_effective_ctr,
 	},
 	{
 		.desc = "Instruction cache invalidation not required for I/D coherence",
@@ -1222,6 +1333,41 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_hw_dbm,
 	},
 #endif
+#ifdef CONFIG_ARM64_SSBD
+	{
+		.desc = "CRC32 instructions",
+		.capability = ARM64_HAS_CRC32,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_cpuid_feature,
+		.sys_reg = SYS_ID_AA64ISAR0_EL1,
+		.field_pos = ID_AA64ISAR0_CRC32_SHIFT,
+		.min_field_value = 1,
+	},
+	{
+		.desc = "Speculative Store Bypassing Safe (SSBS)",
+		.capability = ARM64_SSBS,
+		.type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
+		.matches = has_cpuid_feature,
+		.sys_reg = SYS_ID_AA64PFR1_EL1,
+		.field_pos = ID_AA64PFR1_SSBS_SHIFT,
+		.sign = FTR_UNSIGNED,
+		.min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY,
+		.cpu_enable = cpu_enable_ssbs,
+	},
+#endif
+#ifdef CONFIG_ARM64_CNP
+	{
+		.desc = "Common not Private translations",
+		.capability = ARM64_HAS_CNP,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_useable_cnp,
+		.sys_reg = SYS_ID_AA64MMFR2_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64MMFR2_CNP_SHIFT,
+		.min_field_value = 1,
+		.cpu_enable = cpu_enable_cnp,
+	},
+#endif
 	{},
 };
 
@@ -1267,6 +1413,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 #ifdef CONFIG_ARM64_SVE
 	HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, HWCAP_SVE),
 #endif
+	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, HWCAP_SSBS),
 	{},
 };
 
@@ -1658,6 +1805,11 @@ cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused)
 	return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO));
 }
 
+static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap)
+{
+	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+}
+
 /*
  * We emulate only the following system register space.
  * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 4 - 7]
@@ -1719,27 +1871,32 @@ static int emulate_sys_reg(u32 id, u64 *valp)
 	return 0;
 }
 
-static int emulate_mrs(struct pt_regs *regs, u32 insn)
+int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt)
 {
 	int rc;
-	u32 sys_reg, dst;
 	u64 val;
 
-	/*
-	 * sys_reg values are defined as used in mrs/msr instruction.
-	 * shift the imm value to get the encoding.
-	 */
-	sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5;
 	rc = emulate_sys_reg(sys_reg, &val);
 	if (!rc) {
-		dst = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);
-		pt_regs_write_reg(regs, dst, val);
+		pt_regs_write_reg(regs, rt, val);
 		arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
 	}
-
 	return rc;
 }
 
+static int emulate_mrs(struct pt_regs *regs, u32 insn)
+{
+	u32 sys_reg, rt;
+
+	/*
+	 * sys_reg values are defined as used in mrs/msr instruction.
+	 * shift the imm value to get the encoding.
+	 */
+	sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5;
+	rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);
+	return do_emulate_mrs(regs, sys_reg, rt);
+}
+
 static struct undef_hook mrs_hook = {
 	.instr_mask = 0xfff00000,
 	.instr_val  = 0xd5300000,
@@ -1755,9 +1912,3 @@ static int __init enable_mrs_emulation(void)
 }
 
 core_initcall(enable_mrs_emulation);
-
-void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused)
-{
-	/* Firmware may have left a deferred SError in this register. */
-	write_sysreg_s(0, SYS_DISR_EL1);
-}
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index e9ab7b3ed317..bcc2831399cb 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -81,6 +81,7 @@ static const char *const hwcap_str[] = {
 	"uscat",
 	"ilrcpc",
 	"flagm",
+	"ssbs",
 	NULL
 };
 
@@ -324,7 +325,15 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
 static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 {
 	info->reg_cntfrq = arch_timer_get_cntfrq();
-	info->reg_ctr = read_cpuid_cachetype();
+	/*
+	 * Use the effective value of the CTR_EL0 than the raw value
+	 * exposed by the CPU. CTR_E0.IDC field value must be interpreted
+	 * with the CLIDR_EL1 fields to avoid triggering false warnings
+	 * when there is a mismatch across the CPUs. Keep track of the
+	 * effective value of the CTR_EL0 in our internal records for
+	 * acurate sanity check and feature enablement.
+	 */
+	info->reg_ctr = read_cpuid_effective_cachetype();
 	info->reg_dczid = read_cpuid(DCZID_EL0);
 	info->reg_midr = read_cpuid_id();
 	info->reg_revidr = read_cpuid(REVIDR_EL1);
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 06ca574495af..d7bb6aefae0a 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -210,13 +210,6 @@ NOKPROBE_SYMBOL(call_step_hook);
 static void send_user_sigtrap(int si_code)
 {
 	struct pt_regs *regs = current_pt_regs();
-	siginfo_t info;
-
-	clear_siginfo(&info);
-	info.si_signo	= SIGTRAP;
-	info.si_errno	= 0;
-	info.si_code	= si_code;
-	info.si_addr	= (void __user *)instruction_pointer(regs);
 
 	if (WARN_ON(!user_mode(regs)))
 		return;
@@ -224,7 +217,9 @@ static void send_user_sigtrap(int si_code)
 	if (interrupts_enabled(regs))
 		local_irq_enable();
 
-	arm64_force_sig_info(&info, "User debug trap", current);
+	arm64_force_sig_fault(SIGTRAP, si_code,
+			     (void __user *)instruction_pointer(regs),
+			     "User debug trap");
 }
 
 static int single_step_handler(unsigned long addr, unsigned int esr,
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 09dbea221a27..039144ecbcb2 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -589,7 +589,7 @@ el1_undef:
 	inherit_daif	pstate=x23, tmp=x2
 	mov	x0, sp
 	bl	do_undefinstr
-	ASM_BUG()
+	kernel_exit 1
 el1_dbg:
 	/*
 	 * Debug exception handling
@@ -665,6 +665,7 @@ el0_sync:
 	cmp	x24, #ESR_ELx_EC_FP_EXC64	// FP/ASIMD exception
 	b.eq	el0_fpsimd_exc
 	cmp	x24, #ESR_ELx_EC_SYS64		// configurable trap
+	ccmp	x24, #ESR_ELx_EC_WFx, #4, ne
 	b.eq	el0_sys
 	cmp	x24, #ESR_ELx_EC_SP_ALIGN	// stack alignment exception
 	b.eq	el0_sp_pc
@@ -697,9 +698,9 @@ el0_sync_compat:
 	cmp	x24, #ESR_ELx_EC_UNKNOWN	// unknown exception in EL0
 	b.eq	el0_undef
 	cmp	x24, #ESR_ELx_EC_CP15_32	// CP15 MRC/MCR trap
-	b.eq	el0_undef
+	b.eq	el0_cp15
 	cmp	x24, #ESR_ELx_EC_CP15_64	// CP15 MRRC/MCRR trap
-	b.eq	el0_undef
+	b.eq	el0_cp15
 	cmp	x24, #ESR_ELx_EC_CP14_MR	// CP14 MRC/MCR trap
 	b.eq	el0_undef
 	cmp	x24, #ESR_ELx_EC_CP14_LS	// CP14 LDC/STC trap
@@ -722,6 +723,17 @@ el0_irq_compat:
 el0_error_compat:
 	kernel_entry 0, 32
 	b	el0_error_naked
+
+el0_cp15:
+	/*
+	 * Trapped CP15 (MRC, MCR, MRRC, MCRR) instructions
+	 */
+	enable_daif
+	ct_user_exit
+	mov	x0, x25
+	mov	x1, sp
+	bl	do_cp15instr
+	b	ret_to_user
 #endif
 
 el0_da:
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 58c53bc96928..5ebe73b69961 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -842,7 +842,6 @@ asmlinkage void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
  */
 asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
 {
-	siginfo_t info;
 	unsigned int si_code = FPE_FLTUNK;
 
 	if (esr & ESR_ELx_FP_EXC_TFV) {
@@ -858,12 +857,9 @@ asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
 			si_code = FPE_FLTRES;
 	}
 
-	clear_siginfo(&info);
-	info.si_signo = SIGFPE;
-	info.si_code = si_code;
-	info.si_addr = (void __user *)instruction_pointer(regs);
-
-	send_sig_info(SIGFPE, &info, current);
+	send_sig_fault(SIGFPE, si_code,
+		       (void __user *)instruction_pointer(regs),
+		       current);
 }
 
 void fpsimd_thread_switch(struct task_struct *next)
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index b0853069702f..4471f570a295 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -287,19 +287,21 @@ __create_page_tables:
 	mov	x28, lr
 
 	/*
-	 * Invalidate the idmap and swapper page tables to avoid potential
-	 * dirty cache lines being evicted.
+	 * Invalidate the init page tables to avoid potential dirty cache lines
+	 * being evicted. Other page tables are allocated in rodata as part of
+	 * the kernel image, and thus are clean to the PoC per the boot
+	 * protocol.
 	 */
-	adrp	x0, idmap_pg_dir
-	adrp	x1, swapper_pg_end
+	adrp	x0, init_pg_dir
+	adrp	x1, init_pg_end
 	sub	x1, x1, x0
 	bl	__inval_dcache_area
 
 	/*
-	 * Clear the idmap and swapper page tables.
+	 * Clear the init page tables.
 	 */
-	adrp	x0, idmap_pg_dir
-	adrp	x1, swapper_pg_end
+	adrp	x0, init_pg_dir
+	adrp	x1, init_pg_end
 	sub	x1, x1, x0
 1:	stp	xzr, xzr, [x0], #16
 	stp	xzr, xzr, [x0], #16
@@ -373,7 +375,7 @@ __create_page_tables:
 	/*
 	 * Map the kernel image (starting with PHYS_OFFSET).
 	 */
-	adrp	x0, swapper_pg_dir
+	adrp	x0, init_pg_dir
 	mov_q	x5, KIMAGE_VADDR + TEXT_OFFSET	// compile time __va(_text)
 	add	x5, x5, x23			// add KASLR displacement
 	mov	x4, PTRS_PER_PGD
@@ -390,7 +392,7 @@ __create_page_tables:
 	 * tables again to remove any speculatively loaded cache lines.
 	 */
 	adrp	x0, idmap_pg_dir
-	adrp	x1, swapper_pg_end
+	adrp	x1, init_pg_end
 	sub	x1, x1, x0
 	dmb	sy
 	bl	__inval_dcache_area
@@ -706,6 +708,7 @@ secondary_startup:
 	 * Common entry point for secondary CPUs.
 	 */
 	bl	__cpu_setup			// initialise processor
+	adrp	x1, swapper_pg_dir
 	bl	__enable_mmu
 	ldr	x8, =__secondary_switched
 	br	x8
@@ -748,6 +751,7 @@ ENDPROC(__secondary_switched)
  * Enable the MMU.
  *
  *  x0  = SCTLR_EL1 value for turning on the MMU.
+ *  x1  = TTBR1_EL1 value
  *
  * Returns to the caller via x30/lr. This requires the caller to be covered
  * by the .idmap.text section.
@@ -756,17 +760,16 @@ ENDPROC(__secondary_switched)
  * If it isn't, park the CPU
  */
 ENTRY(__enable_mmu)
-	mrs	x1, ID_AA64MMFR0_EL1
-	ubfx	x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4
+	mrs	x2, ID_AA64MMFR0_EL1
+	ubfx	x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4
 	cmp	x2, #ID_AA64MMFR0_TGRAN_SUPPORTED
 	b.ne	__no_granule_support
-	update_early_cpu_boot_status 0, x1, x2
-	adrp	x1, idmap_pg_dir
-	adrp	x2, swapper_pg_dir
-	phys_to_ttbr x3, x1
-	phys_to_ttbr x4, x2
-	msr	ttbr0_el1, x3			// load TTBR0
-	msr	ttbr1_el1, x4			// load TTBR1
+	update_early_cpu_boot_status 0, x2, x3
+	adrp	x2, idmap_pg_dir
+	phys_to_ttbr x1, x1
+	phys_to_ttbr x2, x2
+	msr	ttbr0_el1, x2			// load TTBR0
+	msr	ttbr1_el1, x1			// load TTBR1
 	isb
 	msr	sctlr_el1, x0
 	isb
@@ -823,6 +826,7 @@ __primary_switch:
 	mrs	x20, sctlr_el1			// preserve old SCTLR_EL1 value
 #endif
 
+	adrp	x1, init_pg_dir
 	bl	__enable_mmu
 #ifdef CONFIG_RELOCATABLE
 	bl	__relocate_kernel
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index e0756416e567..646b9562ee64 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -25,12 +25,12 @@
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
-	void *addr = (void *)entry->code;
+	void *addr = (void *)jump_entry_code(entry);
 	u32 insn;
 
 	if (type == JUMP_LABEL_JMP) {
-		insn = aarch64_insn_gen_branch_imm(entry->code,
-						   entry->target,
+		insn = aarch64_insn_gen_branch_imm(jump_entry_code(entry),
+						   jump_entry_target(entry),
 						   AARCH64_INSN_BRANCH_NOLINK);
 	} else {
 		insn = aarch64_insn_gen_nop();
diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
index 53f371ed4568..75c158b0353f 100644
--- a/arch/arm64/kernel/paravirt.c
+++ b/arch/arm64/kernel/paravirt.c
@@ -21,5 +21,5 @@
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
 
-struct pv_time_ops pv_time_ops;
-EXPORT_SYMBOL_GPL(pv_time_ops);
+struct paravirt_patch_template pv_ops;
+EXPORT_SYMBOL_GPL(pv_ops);
diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
index 0e2ea1c78542..bb85e2f4603f 100644
--- a/arch/arm64/kernel/pci.c
+++ b/arch/arm64/kernel/pci.c
@@ -165,16 +165,15 @@ static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci)
 /* Interface called from ACPI code to setup PCI host controller */
 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 {
-	int node = acpi_get_node(root->device->handle);
 	struct acpi_pci_generic_root_info *ri;
 	struct pci_bus *bus, *child;
 	struct acpi_pci_root_ops *root_ops;
 
-	ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node);
+	ri = kzalloc(sizeof(*ri), GFP_KERNEL);
 	if (!ri)
 		return NULL;
 
-	root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node);
+	root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL);
 	if (!root_ops) {
 		kfree(ri);
 		return NULL;
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 8e38d5267f22..e213f8e867f6 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -966,6 +966,12 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
 	return 0;
 }
 
+static int armv8pmu_filter_match(struct perf_event *event)
+{
+	unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
+	return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
+}
+
 static void armv8pmu_reset(void *info)
 {
 	struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
@@ -1114,6 +1120,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->stop			= armv8pmu_stop,
 	cpu_pmu->reset			= armv8pmu_reset,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
+	cpu_pmu->filter_match		= armv8pmu_filter_match;
 
 	return 0;
 }
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index e78c3ef04d95..9b65132e789a 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -107,7 +107,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 		if (!p->ainsn.api.insn)
 			return -ENOMEM;
 		break;
-	};
+	}
 
 	/* prepare the instruction */
 	if (p->ainsn.api.insn)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 7f1628effe6d..ce99c58cd1f1 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -358,6 +358,10 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 		if (IS_ENABLED(CONFIG_ARM64_UAO) &&
 		    cpus_have_const_cap(ARM64_HAS_UAO))
 			childregs->pstate |= PSR_UAO_BIT;
+
+		if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
+			childregs->pstate |= PSR_SSBS_BIT;
+
 		p->thread.cpu_context.x19 = stack_start;
 		p->thread.cpu_context.x20 = stk_sz;
 	}
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index e8edbf13302a..8cdaf25e99cd 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -24,7 +24,6 @@
 
 #include <uapi/linux/psci.h>
 
-#include <asm/compiler.h>
 #include <asm/cpu_ops.h>
 #include <asm/errno.h>
 #include <asm/smp_plat.h>
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 6219486fa25f..1710a2d01669 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -182,13 +182,7 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 				struct pt_regs *regs)
 {
 	struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp);
-	siginfo_t info;
-
-	clear_siginfo(&info);
-	info.si_signo	= SIGTRAP;
-	info.si_errno	= 0;
-	info.si_code	= TRAP_HWBKPT;
-	info.si_addr	= (void __user *)(bkpt->trigger);
+	const char *desc = "Hardware breakpoint trap (ptrace)";
 
 #ifdef CONFIG_COMPAT
 	if (is_compat_task()) {
@@ -208,10 +202,14 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 				break;
 			}
 		}
-		force_sig_ptrace_errno_trap(si_errno, (void __user *)bkpt->trigger);
+		arm64_force_sig_ptrace_errno_trap(si_errno,
+						  (void __user *)bkpt->trigger,
+						  desc);
 	}
 #endif
-	arm64_force_sig_info(&info, "Hardware breakpoint trap (ptrace)", current);
+	arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT,
+			      (void __user *)(bkpt->trigger),
+			      desc);
 }
 
 /*
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 5b4fac434c84..d0f62dd24c90 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -64,6 +64,9 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/mmu_context.h>
 
+static int num_standard_resources;
+static struct resource *standard_resources;
+
 phys_addr_t __fdt_pointer __initdata;
 
 /*
@@ -206,14 +209,19 @@ static void __init request_standard_resources(void)
 {
 	struct memblock_region *region;
 	struct resource *res;
+	unsigned long i = 0;
 
 	kernel_code.start   = __pa_symbol(_text);
 	kernel_code.end     = __pa_symbol(__init_begin - 1);
 	kernel_data.start   = __pa_symbol(_sdata);
 	kernel_data.end     = __pa_symbol(_end - 1);
 
+	num_standard_resources = memblock.memory.cnt;
+	standard_resources = alloc_bootmem_low(num_standard_resources *
+					       sizeof(*standard_resources));
+
 	for_each_memblock(memory, region) {
-		res = alloc_bootmem_low(sizeof(*res));
+		res = &standard_resources[i++];
 		if (memblock_is_nomap(region)) {
 			res->name  = "reserved";
 			res->flags = IORESOURCE_MEM;
@@ -243,36 +251,26 @@ static void __init request_standard_resources(void)
 
 static int __init reserve_memblock_reserved_regions(void)
 {
-	phys_addr_t start, end, roundup_end = 0;
-	struct resource *mem, *res;
-	u64 i;
-
-	for_each_reserved_mem_region(i, &start, &end) {
-		if (end <= roundup_end)
-			continue; /* done already */
-
-		start = __pfn_to_phys(PFN_DOWN(start));
-		end = __pfn_to_phys(PFN_UP(end)) - 1;
-		roundup_end = end;
-
-		res = kzalloc(sizeof(*res), GFP_ATOMIC);
-		if (WARN_ON(!res))
-			return -ENOMEM;
-		res->start = start;
-		res->end = end;
-		res->name  = "reserved";
-		res->flags = IORESOURCE_MEM;
-
-		mem = request_resource_conflict(&iomem_resource, res);
-		/*
-		 * We expected memblock_reserve() regions to conflict with
-		 * memory created by request_standard_resources().
-		 */
-		if (WARN_ON_ONCE(!mem))
+	u64 i, j;
+
+	for (i = 0; i < num_standard_resources; ++i) {
+		struct resource *mem = &standard_resources[i];
+		phys_addr_t r_start, r_end, mem_size = resource_size(mem);
+
+		if (!memblock_is_region_reserved(mem->start, mem_size))
 			continue;
-		kfree(res);
 
-		reserve_region_with_split(mem, start, end, "reserved");
+		for_each_reserved_mem_region(j, &r_start, &r_end) {
+			resource_size_t start, end;
+
+			start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start);
+			end = min(PFN_PHYS(PFN_UP(r_end)) - 1, mem->end);
+
+			if (start > mem->end || end < mem->start)
+				continue;
+
+			reserve_region_with_split(mem, start, end, "reserved");
+		}
 	}
 
 	return 0;
@@ -351,12 +349,8 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 #ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
-	conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
 	conswitchp = &dummy_con;
 #endif
-#endif
 	if (boot_args[1] || boot_args[2] || boot_args[3]) {
 		pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
 			"\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index bebec8ef9372..3e53ffa07994 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -101,6 +101,7 @@ ENTRY(cpu_resume)
 	bl	el2_setup		// if in EL2 drop to EL1 cleanly
 	bl	__cpu_setup
 	/* enable the MMU early - so we can access sleep_save_stash by va */
+	adrp	x1, swapper_pg_dir
 	bl	__enable_mmu
 	ldr	x8, =_cpu_resume
 	br	x8
diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c
index 3432e5ef9f41..885f13e58708 100644
--- a/arch/arm64/kernel/ssbd.c
+++ b/arch/arm64/kernel/ssbd.c
@@ -3,17 +3,33 @@
  * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
  */
 
+#include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/thread_info.h>
 
 #include <asm/cpufeature.h>
 
+static void ssbd_ssbs_enable(struct task_struct *task)
+{
+	u64 val = is_compat_thread(task_thread_info(task)) ?
+		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+
+	task_pt_regs(task)->pstate |= val;
+}
+
+static void ssbd_ssbs_disable(struct task_struct *task)
+{
+	u64 val = is_compat_thread(task_thread_info(task)) ?
+		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+
+	task_pt_regs(task)->pstate &= ~val;
+}
+
 /*
  * prctl interface for SSBD
- * FIXME: Drop the below ifdefery once merged in 4.18.
  */
-#ifdef PR_SPEC_STORE_BYPASS
 static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
 {
 	int state = arm64_get_ssbd_state();
@@ -46,12 +62,14 @@ static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
 			return -EPERM;
 		task_clear_spec_ssb_disable(task);
 		clear_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_ssbs_enable(task);
 		break;
 	case PR_SPEC_DISABLE:
 		if (state == ARM64_SSBD_FORCE_DISABLE)
 			return -EPERM;
 		task_set_spec_ssb_disable(task);
 		set_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_ssbs_disable(task);
 		break;
 	case PR_SPEC_FORCE_DISABLE:
 		if (state == ARM64_SSBD_FORCE_DISABLE)
@@ -59,6 +77,7 @@ static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
 		task_set_spec_ssb_disable(task);
 		task_set_spec_ssb_force_disable(task);
 		set_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_ssbs_disable(task);
 		break;
 	default:
 		return -ERANGE;
@@ -107,4 +126,3 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
 		return -ENODEV;
 	}
 }
-#endif	/* PR_SPEC_STORE_BYPASS */
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 70c283368b64..9405d1b7f4b0 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -48,6 +48,10 @@ void notrace __cpu_suspend_exit(void)
 	 */
 	cpu_uninstall_idmap();
 
+	/* Restore CnP bit in TTBR1_EL1 */
+	if (system_supports_cnp())
+		cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+
 	/*
 	 * PSTATE was not saved over suspend/resume, re-enable any detected
 	 * features that might not have been set correctly.
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index a6109825eeb9..32653d156747 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -68,8 +68,8 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags)
  */
 long compat_arm_syscall(struct pt_regs *regs)
 {
-	siginfo_t info;
 	unsigned int no = regs->regs[7];
+	void __user *addr;
 
 	switch (no) {
 	/*
@@ -112,13 +112,10 @@ long compat_arm_syscall(struct pt_regs *regs)
 		break;
 	}
 
-	clear_siginfo(&info);
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code  = ILL_ILLTRP;
-	info.si_addr  = (void __user *)instruction_pointer(regs) -
-			 (compat_thumb_mode(regs) ? 2 : 4);
+	addr  = (void __user *)instruction_pointer(regs) -
+		(compat_thumb_mode(regs) ? 2 : 4);
 
-	arm64_notify_die("Oops - bad compat syscall(2)", regs, &info, no);
+	arm64_notify_die("Oops - bad compat syscall(2)", regs,
+			 SIGILL, ILL_ILLTRP, addr, no);
 	return 0;
 }
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 039e9ff379cc..5f4d9acb32f5 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -224,24 +224,19 @@ void die(const char *str, struct pt_regs *regs, int err)
 		do_exit(SIGSEGV);
 }
 
-static bool show_unhandled_signals_ratelimited(void)
+static void arm64_show_signal(int signo, const char *str)
 {
 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
-	return show_unhandled_signals && __ratelimit(&rs);
-}
-
-void arm64_force_sig_info(struct siginfo *info, const char *str,
-			  struct task_struct *tsk)
-{
+	struct task_struct *tsk = current;
 	unsigned int esr = tsk->thread.fault_code;
 	struct pt_regs *regs = task_pt_regs(tsk);
 
-	if (!unhandled_signal(tsk, info->si_signo))
-		goto send_sig;
-
-	if (!show_unhandled_signals_ratelimited())
-		goto send_sig;
+	/* Leave if the signal won't be shown */
+	if (!show_unhandled_signals ||
+	    !unhandled_signal(tsk, signo) ||
+	    !__ratelimit(&rs))
+		return;
 
 	pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk));
 	if (esr)
@@ -251,19 +246,39 @@ void arm64_force_sig_info(struct siginfo *info, const char *str,
 	print_vma_addr(KERN_CONT " in ", regs->pc);
 	pr_cont("\n");
 	__show_regs(regs);
+}
 
-send_sig:
-	force_sig_info(info->si_signo, info, tsk);
+void arm64_force_sig_fault(int signo, int code, void __user *addr,
+			   const char *str)
+{
+	arm64_show_signal(signo, str);
+	force_sig_fault(signo, code, addr, current);
+}
+
+void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
+			    const char *str)
+{
+	arm64_show_signal(SIGBUS, str);
+	force_sig_mceerr(code, addr, lsb, current);
+}
+
+void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr,
+				       const char *str)
+{
+	arm64_show_signal(SIGTRAP, str);
+	force_sig_ptrace_errno_trap(errno, addr);
 }
 
 void arm64_notify_die(const char *str, struct pt_regs *regs,
-		      struct siginfo *info, int err)
+		      int signo, int sicode, void __user *addr,
+		      int err)
 {
 	if (user_mode(regs)) {
 		WARN_ON(regs != current_pt_regs());
 		current->thread.fault_address = 0;
 		current->thread.fault_code = err;
-		arm64_force_sig_info(info, str, current);
+
+		arm64_force_sig_fault(signo, sicode, addr, str);
 	} else {
 		die(str, regs, err);
 	}
@@ -310,10 +325,12 @@ static int call_undef_hook(struct pt_regs *regs)
 	int (*fn)(struct pt_regs *regs, u32 instr) = NULL;
 	void __user *pc = (void __user *)instruction_pointer(regs);
 
-	if (!user_mode(regs))
-		return 1;
-
-	if (compat_thumb_mode(regs)) {
+	if (!user_mode(regs)) {
+		__le32 instr_le;
+		if (probe_kernel_address((__force __le32 *)pc, instr_le))
+			goto exit;
+		instr = le32_to_cpu(instr_le);
+	} else if (compat_thumb_mode(regs)) {
 		/* 16-bit Thumb instruction */
 		__le16 instr_le;
 		if (get_user(instr_le, (__le16 __user *)pc))
@@ -348,11 +365,11 @@ exit:
 
 void force_signal_inject(int signal, int code, unsigned long address)
 {
-	siginfo_t info;
 	const char *desc;
 	struct pt_regs *regs = current_pt_regs();
 
-	clear_siginfo(&info);
+	if (WARN_ON(!user_mode(regs)))
+		return;
 
 	switch (signal) {
 	case SIGILL:
@@ -372,12 +389,7 @@ void force_signal_inject(int signal, int code, unsigned long address)
 		signal = SIGKILL;
 	}
 
-	info.si_signo = signal;
-	info.si_errno = 0;
-	info.si_code  = code;
-	info.si_addr  = (void __user *)address;
-
-	arm64_notify_die(desc, regs, &info, 0);
+	arm64_notify_die(desc, regs, signal, code, (void __user *)address, 0);
 }
 
 /*
@@ -406,14 +418,10 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
 	if (call_undef_hook(regs) == 0)
 		return;
 
+	BUG_ON(!user_mode(regs));
 	force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
 }
 
-void cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
-{
-	sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCI, 0);
-}
-
 #define __user_cache_maint(insn, address, res)			\
 	if (address >= user_addr_max()) {			\
 		res = -EFAULT;					\
@@ -437,7 +445,7 @@ void cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
 static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
 {
 	unsigned long address;
-	int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+	int rt = ESR_ELx_SYS64_ISS_RT(esr);
 	int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
 	int ret = 0;
 
@@ -472,7 +480,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
 
 static void ctr_read_handler(unsigned int esr, struct pt_regs *regs)
 {
-	int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+	int rt = ESR_ELx_SYS64_ISS_RT(esr);
 	unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0);
 
 	pt_regs_write_reg(regs, rt, val);
@@ -482,7 +490,7 @@ static void ctr_read_handler(unsigned int esr, struct pt_regs *regs)
 
 static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs)
 {
-	int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+	int rt = ESR_ELx_SYS64_ISS_RT(esr);
 
 	pt_regs_write_reg(regs, rt, arch_counter_get_cntvct());
 	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
@@ -490,12 +498,28 @@ static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs)
 
 static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs)
 {
-	int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+	int rt = ESR_ELx_SYS64_ISS_RT(esr);
 
 	pt_regs_write_reg(regs, rt, arch_timer_get_rate());
 	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
 }
 
+static void mrs_handler(unsigned int esr, struct pt_regs *regs)
+{
+	u32 sysreg, rt;
+
+	rt = ESR_ELx_SYS64_ISS_RT(esr);
+	sysreg = esr_sys64_to_sysreg(esr);
+
+	if (do_emulate_mrs(regs, sysreg, rt) != 0)
+		force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
+}
+
+static void wfi_handler(unsigned int esr, struct pt_regs *regs)
+{
+	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
+}
+
 struct sys64_hook {
 	unsigned int esr_mask;
 	unsigned int esr_val;
@@ -526,9 +550,176 @@ static struct sys64_hook sys64_hooks[] = {
 		.esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ,
 		.handler = cntfrq_read_handler,
 	},
+	{
+		/* Trap read access to CPUID registers */
+		.esr_mask = ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK,
+		.esr_val = ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL,
+		.handler = mrs_handler,
+	},
+	{
+		/* Trap WFI instructions executed in userspace */
+		.esr_mask = ESR_ELx_WFx_MASK,
+		.esr_val = ESR_ELx_WFx_WFI_VAL,
+		.handler = wfi_handler,
+	},
 	{},
 };
 
+
+#ifdef CONFIG_COMPAT
+#define PSTATE_IT_1_0_SHIFT	25
+#define PSTATE_IT_1_0_MASK	(0x3 << PSTATE_IT_1_0_SHIFT)
+#define PSTATE_IT_7_2_SHIFT	10
+#define PSTATE_IT_7_2_MASK	(0x3f << PSTATE_IT_7_2_SHIFT)
+
+static u32 compat_get_it_state(struct pt_regs *regs)
+{
+	u32 it, pstate = regs->pstate;
+
+	it  = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT;
+	it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2;
+
+	return it;
+}
+
+static void compat_set_it_state(struct pt_regs *regs, u32 it)
+{
+	u32 pstate_it;
+
+	pstate_it  = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK;
+	pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK;
+
+	regs->pstate &= ~PSR_AA32_IT_MASK;
+	regs->pstate |= pstate_it;
+}
+
+static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs)
+{
+	int cond;
+
+	/* Only a T32 instruction can trap without CV being set */
+	if (!(esr & ESR_ELx_CV)) {
+		u32 it;
+
+		it = compat_get_it_state(regs);
+		if (!it)
+			return true;
+
+		cond = it >> 4;
+	} else {
+		cond = (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
+	}
+
+	return aarch32_opcode_cond_checks[cond](regs->pstate);
+}
+
+static void advance_itstate(struct pt_regs *regs)
+{
+	u32 it;
+
+	/* ARM mode */
+	if (!(regs->pstate & PSR_AA32_T_BIT) ||
+	    !(regs->pstate & PSR_AA32_IT_MASK))
+		return;
+
+	it  = compat_get_it_state(regs);
+
+	/*
+	 * If this is the last instruction of the block, wipe the IT
+	 * state. Otherwise advance it.
+	 */
+	if (!(it & 7))
+		it = 0;
+	else
+		it = (it & 0xe0) | ((it << 1) & 0x1f);
+
+	compat_set_it_state(regs, it);
+}
+
+static void arm64_compat_skip_faulting_instruction(struct pt_regs *regs,
+						   unsigned int sz)
+{
+	advance_itstate(regs);
+	arm64_skip_faulting_instruction(regs, sz);
+}
+
+static void compat_cntfrq_read_handler(unsigned int esr, struct pt_regs *regs)
+{
+	int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT;
+
+	pt_regs_write_reg(regs, reg, arch_timer_get_rate());
+	arm64_compat_skip_faulting_instruction(regs, 4);
+}
+
+static struct sys64_hook cp15_32_hooks[] = {
+	{
+		.esr_mask = ESR_ELx_CP15_32_ISS_SYS_MASK,
+		.esr_val = ESR_ELx_CP15_32_ISS_SYS_CNTFRQ,
+		.handler = compat_cntfrq_read_handler,
+	},
+	{},
+};
+
+static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs)
+{
+	int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT;
+	int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT;
+	u64 val = arch_counter_get_cntvct();
+
+	pt_regs_write_reg(regs, rt, lower_32_bits(val));
+	pt_regs_write_reg(regs, rt2, upper_32_bits(val));
+	arm64_compat_skip_faulting_instruction(regs, 4);
+}
+
+static struct sys64_hook cp15_64_hooks[] = {
+	{
+		.esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK,
+		.esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCT,
+		.handler = compat_cntvct_read_handler,
+	},
+	{},
+};
+
+asmlinkage void __exception do_cp15instr(unsigned int esr, struct pt_regs *regs)
+{
+	struct sys64_hook *hook, *hook_base;
+
+	if (!cp15_cond_valid(esr, regs)) {
+		/*
+		 * There is no T16 variant of a CP access, so we
+		 * always advance PC by 4 bytes.
+		 */
+		arm64_compat_skip_faulting_instruction(regs, 4);
+		return;
+	}
+
+	switch (ESR_ELx_EC(esr)) {
+	case ESR_ELx_EC_CP15_32:
+		hook_base = cp15_32_hooks;
+		break;
+	case ESR_ELx_EC_CP15_64:
+		hook_base = cp15_64_hooks;
+		break;
+	default:
+		do_undefinstr(regs);
+		return;
+	}
+
+	for (hook = hook_base; hook->handler; hook++)
+		if ((hook->esr_mask & esr) == hook->esr_val) {
+			hook->handler(esr, regs);
+			return;
+		}
+
+	/*
+	 * New cp15 instructions may previously have been undefined at
+	 * EL0. Fall back to our usual undefined instruction handler
+	 * so that we handle these consistently.
+	 */
+	do_undefinstr(regs);
+}
+#endif
+
 asmlinkage void __exception do_sysinstr(unsigned int esr, struct pt_regs *regs)
 {
 	struct sys64_hook *hook;
@@ -605,7 +796,6 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
 		handler[reason], smp_processor_id(), esr,
 		esr_get_class_string(esr));
 
-	die("Oops - bad mode", regs, 0);
 	local_daif_mask();
 	panic("bad mode");
 }
@@ -616,19 +806,13 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
  */
 asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 {
-	siginfo_t info;
 	void __user *pc = (void __user *)instruction_pointer(regs);
 
-	clear_siginfo(&info);
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code  = ILL_ILLOPC;
-	info.si_addr  = pc;
-
 	current->thread.fault_address = 0;
 	current->thread.fault_code = esr;
 
-	arm64_force_sig_info(&info, "Bad EL0 synchronous exception", current);
+	arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc,
+			      "Bad EL0 synchronous exception");
 }
 
 #ifdef CONFIG_VMAP_STACK
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 605d1b60469c..03b00007553d 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -138,6 +138,23 @@ SECTIONS
 	EXCEPTION_TABLE(8)		/* __init_begin will be marked RO NX */
 	NOTES
 
+	. = ALIGN(PAGE_SIZE);
+	idmap_pg_dir = .;
+	. += IDMAP_DIR_SIZE;
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+	tramp_pg_dir = .;
+	. += PAGE_SIZE;
+#endif
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	reserved_ttbr0 = .;
+	. += RESERVED_TTBR0_SIZE;
+#endif
+	swapper_pg_dir = .;
+	. += PAGE_SIZE;
+	swapper_pg_end = .;
+
 	. = ALIGN(SEGMENT_ALIGN);
 	__init_begin = .;
 	__inittext_begin = .;
@@ -166,7 +183,6 @@ SECTIONS
 		INIT_SETUP(16)
 		INIT_CALLS
 		CON_INITCALL
-		SECURITY_INITCALL
 		INIT_RAM_FS
 		*(.init.rodata.* .init.bss)	/* from the EFI stub */
 	}
@@ -216,21 +232,9 @@ SECTIONS
 	BSS_SECTION(0, 0, 0)
 
 	. = ALIGN(PAGE_SIZE);
-	idmap_pg_dir = .;
-	. += IDMAP_DIR_SIZE;
-
-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-	tramp_pg_dir = .;
-	. += PAGE_SIZE;
-#endif
-
-#ifdef CONFIG_ARM64_SW_TTBR0_PAN
-	reserved_ttbr0 = .;
-	. += RESERVED_TTBR0_SIZE;
-#endif
-	swapper_pg_dir = .;
-	. += SWAPPER_DIR_SIZE;
-	swapper_pg_end = .;
+	init_pg_dir = .;
+	. += INIT_DIR_SIZE;
+	init_pg_end = .;
 
 	__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
 	_end = .;
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 07256b08226c..dd436a50fce7 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -57,6 +57,45 @@ static u64 core_reg_offset_from_id(u64 id)
 	return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
 }
 
+static int validate_core_offset(const struct kvm_one_reg *reg)
+{
+	u64 off = core_reg_offset_from_id(reg->id);
+	int size;
+
+	switch (off) {
+	case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
+	     KVM_REG_ARM_CORE_REG(regs.regs[30]):
+	case KVM_REG_ARM_CORE_REG(regs.sp):
+	case KVM_REG_ARM_CORE_REG(regs.pc):
+	case KVM_REG_ARM_CORE_REG(regs.pstate):
+	case KVM_REG_ARM_CORE_REG(sp_el1):
+	case KVM_REG_ARM_CORE_REG(elr_el1):
+	case KVM_REG_ARM_CORE_REG(spsr[0]) ...
+	     KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]):
+		size = sizeof(__u64);
+		break;
+
+	case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
+	     KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
+		size = sizeof(__uint128_t);
+		break;
+
+	case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
+	case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
+		size = sizeof(__u32);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (KVM_REG_SIZE(reg->id) == size &&
+	    IS_ALIGNED(off, size / sizeof(__u32)))
+		return 0;
+
+	return -EINVAL;
+}
+
 static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 {
 	/*
@@ -76,6 +115,9 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 	    (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
 		return -ENOENT;
 
+	if (validate_core_offset(reg))
+		return -EINVAL;
+
 	if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id)))
 		return -EFAULT;
 
@@ -98,6 +140,9 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 	    (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
 		return -ENOENT;
 
+	if (validate_core_offset(reg))
+		return -EINVAL;
+
 	if (KVM_REG_SIZE(reg->id) > sizeof(tmp))
 		return -EINVAL;
 
@@ -107,17 +152,25 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 	}
 
 	if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) {
-		u32 mode = (*(u32 *)valp) & PSR_AA32_MODE_MASK;
+		u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK;
 		switch (mode) {
 		case PSR_AA32_MODE_USR:
+			if (!system_supports_32bit_el0())
+				return -EINVAL;
+			break;
 		case PSR_AA32_MODE_FIQ:
 		case PSR_AA32_MODE_IRQ:
 		case PSR_AA32_MODE_SVC:
 		case PSR_AA32_MODE_ABT:
 		case PSR_AA32_MODE_UND:
+			if (!vcpu_el1_is_32bit(vcpu))
+				return -EINVAL;
+			break;
 		case PSR_MODE_EL0t:
 		case PSR_MODE_EL1t:
 		case PSR_MODE_EL1h:
+			if (vcpu_el1_is_32bit(vcpu))
+				return -EINVAL;
 			break;
 		default:
 			err = -EINVAL;
@@ -338,15 +391,15 @@ int __attribute_const__ kvm_target_cpu(void)
 			return KVM_ARM_TARGET_CORTEX_A53;
 		case ARM_CPU_PART_CORTEX_A57:
 			return KVM_ARM_TARGET_CORTEX_A57;
-		};
+		}
 		break;
 	case ARM_CPU_IMP_APM:
 		switch (part_number) {
 		case APM_CPU_PART_POTENZA:
 			return KVM_ARM_TARGET_XGENE_POTENZA;
-		};
+		}
 		break;
-	};
+	}
 
 	/* Return a default generic target */
 	return KVM_ARM_TARGET_GENERIC_V8;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index e5e741bfffe1..35a81bebd02b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		 */
 		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		return 0;
+	case ARM_EXCEPTION_IL:
+		/*
+		 * We attempted an illegal exception return.  Guest state must
+		 * have been corrupted somehow.  Give up.
+		 */
+		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+		return -EINVAL;
 	default:
 		kvm_pr_unimpl("Unsupported exception type: %d",
 			      exception_index);
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index ea9225160786..4576b86a5579 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -65,6 +65,9 @@ __do_hyp_init:
 	b.lo	__kvm_handle_stub_hvc
 
 	phys_to_ttbr x4, x0
+alternative_if ARM64_HAS_CNP
+	orr	x4, x4, #TTBR_CNP_BIT
+alternative_else_nop_endif
 	msr	ttbr0_el2, x4
 
 	mrs	x4, tcr_el1
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 2fabc2dc1966..82d1904328ad 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
-obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
 
 # KVM code is run at a different exception code with a different map, so
 # compiler instrumentation that inserts callbacks or checks into the code may
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 24b4fbafe3e4..b1f14f736962 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -162,6 +162,20 @@ el1_error:
 	mov	x0, #ARM_EXCEPTION_EL1_SERROR
 	b	__guest_exit
 
+el2_sync:
+	/* Check for illegal exception return, otherwise panic */
+	mrs	x0, spsr_el2
+
+	/* if this was something else, then panic! */
+	tst	x0, #PSR_IL_BIT
+	b.eq	__hyp_panic
+
+	/* Let's attempt a recovery from the illegal exception return */
+	get_vcpu_ptr	x1, x0
+	mov	x0, #ARM_EXCEPTION_IL
+	b	__guest_exit
+
+
 el2_error:
 	ldp	x0, x1, [sp], #16
 
@@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector)
 	invalid_vect	el2t_fiq_invalid	// FIQ EL2t
 	invalid_vect	el2t_error_invalid	// Error EL2t
 
-	invalid_vect	el2h_sync_invalid	// Synchronous EL2h
+	valid_vect	el2_sync		// Synchronous EL2h
 	invalid_vect	el2h_irq_invalid	// IRQ EL2h
 	invalid_vect	el2h_fiq_invalid	// FIQ EL2h
 	valid_vect	el2_error		// Error EL2h
diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
deleted file mode 100644
index 603e1ee83e89..000000000000
--- a/arch/arm64/kvm/hyp/s2-setup.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/types.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_hyp.h>
-
-u32 __hyp_text __init_stage2_translation(void)
-{
-	u64 val = VTCR_EL2_FLAGS;
-	u64 parange;
-	u64 tmp;
-
-	/*
-	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS
-	 * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while
-	 * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2...
-	 */
-	parange = read_sysreg(id_aa64mmfr0_el1) & 7;
-	if (parange > ID_AA64MMFR0_PARANGE_MAX)
-		parange = ID_AA64MMFR0_PARANGE_MAX;
-	val |= parange << 16;
-
-	/* Compute the actual PARange... */
-	switch (parange) {
-	case 0:
-		parange = 32;
-		break;
-	case 1:
-		parange = 36;
-		break;
-	case 2:
-		parange = 40;
-		break;
-	case 3:
-		parange = 42;
-		break;
-	case 4:
-		parange = 44;
-		break;
-	case 5:
-	default:
-		parange = 48;
-		break;
-	}
-
-	/*
-	 * ... and clamp it to 40 bits, unless we have some braindead
-	 * HW that implements less than that. In all cases, we'll
-	 * return that value for the rest of the kernel to decide what
-	 * to do.
-	 */
-	val |= 64 - (parange > 40 ? 40 : parange);
-
-	/*
-	 * Check the availability of Hardware Access Flag / Dirty Bit
-	 * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2.
-	 */
-	tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf;
-	if (tmp)
-		val |= VTCR_EL2_HA;
-
-	/*
-	 * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
-	 * bit in VTCR_EL2.
-	 */
-	tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf;
-	val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ?
-			VTCR_EL2_VS_16BIT :
-			VTCR_EL2_VS_8BIT;
-
-	write_sysreg(val, vtcr_el2);
-
-	return parange;
-}
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index ca46153d7915..7cc175c88a37 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void)
 
 static void __hyp_text __activate_vm(struct kvm *kvm)
 {
-	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+	__load_guest_stage2(kvm);
 }
 
 static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
@@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar)
 		return false; /* Translation failed, back to guest */
 
 	/* Convert PAR to HPFAR format */
-	*hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4;
+	*hpfar = PAR_TO_HPFAR(tmp);
 	return true;
 }
 
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 9ce223944983..68d6f7c3b237 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 static void __hyp_text
 __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
 {
+	u64 pstate = ctxt->gp_regs.regs.pstate;
+	u64 mode = pstate & PSR_AA32_MODE_MASK;
+
+	/*
+	 * Safety check to ensure we're setting the CPU up to enter the guest
+	 * in a less privileged mode.
+	 *
+	 * If we are attempting a return to EL2 or higher in AArch64 state,
+	 * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that
+	 * we'll take an illegal exception state exception immediately after
+	 * the ERET to the guest.  Attempts to return to AArch32 Hyp will
+	 * result in an illegal exception return because EL2's execution state
+	 * is determined by SCR_EL3.RW.
+	 */
+	if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
+		pstate = PSR_MODE_EL2h | PSR_IL_BIT;
+
 	write_sysreg_el2(ctxt->gp_regs.regs.pc,		elr);
-	write_sysreg_el2(ctxt->gp_regs.regs.pstate,	spsr);
+	write_sysreg_el2(pstate,			spsr);
 
 	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
 		write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
@@ -288,3 +305,14 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.sysregs_loaded_on_cpu = false;
 }
+
+void __hyp_text __kvm_enable_ssbs(void)
+{
+	u64 tmp;
+
+	asm volatile(
+	"mrs	%0, sctlr_el2\n"
+	"orr	%0, %0, %1\n"
+	"msr	sctlr_el2, %0"
+	: "=&r" (tmp) : "L" (SCTLR_ELx_DSSBS));
+}
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 131c7772703c..4dbd9c69a96d 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
 	 * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so
 	 * let's flip TGE before executing the TLB operation.
 	 */
-	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+	__load_guest_stage2(kvm);
 	val = read_sysreg(hcr_el2);
 	val &= ~HCR_TGE;
 	write_sysreg(val, hcr_el2);
@@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
 
 static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm)
 {
-	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+	__load_guest_stage2(kvm);
 	isb();
 }
 
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e37c78bbe1ca..b72a3dd56204 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -26,6 +26,7 @@
 
 #include <kvm/arm_arch_timer.h>
 
+#include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/ptrace.h>
 #include <asm/kvm_arm.h>
@@ -33,6 +34,9 @@
 #include <asm/kvm_coproc.h>
 #include <asm/kvm_mmu.h>
 
+/* Maximum phys_shift supported for any VM on this host */
+static u32 kvm_ipa_limit;
+
 /*
  * ARMv8 Reset Values
  */
@@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void)
 }
 
 /**
- * kvm_arch_dev_ioctl_check_extension
+ * kvm_arch_vm_ioctl_check_extension
  *
  * We currently assume that the number of HW registers is uniform
  * across all CPUs (see cpuinfo_sanity_check).
  */
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	int r;
 
@@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
-	case KVM_CAP_VCPU_EVENTS:
 		r = 1;
 		break;
+	case KVM_CAP_ARM_VM_IPA_SIZE:
+		r = kvm_ipa_limit;
+		break;
 	default:
 		r = 0;
 	}
@@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset timer */
 	return kvm_timer_vcpu_reset(vcpu);
 }
+
+void kvm_set_ipa_limit(void)
+{
+	unsigned int ipa_max, pa_max, va_max, parange;
+
+	parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7;
+	pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+	/* Clamp the IPA limit to the PA size supported by the kernel */
+	ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
+	/*
+	 * Since our stage2 table is dependent on the stage1 page table code,
+	 * we must always honor the following condition:
+	 *
+	 *  Number of levels in Stage1 >= Number of levels in Stage2.
+	 *
+	 * So clamp the ipa limit further down to limit the number of levels.
+	 * Since we can concatenate upto 16 tables at entry level, we could
+	 * go upto 4bits above the maximum VA addressible with the current
+	 * number of levels.
+	 */
+	va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
+	va_max += 4;
+
+	if (va_max < ipa_max)
+		ipa_max = va_max;
+
+	/*
+	 * If the final limit is lower than the real physical address
+	 * limit of the CPUs, report the reason.
+	 */
+	if (ipa_max < pa_max)
+		pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
+			(va_max < pa_max) ? "Virtual" : "Physical");
+
+	WARN(ipa_max < KVM_PHYS_SHIFT,
+	     "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
+	kvm_ipa_limit = ipa_max;
+	kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+}
+
+/*
+ * Configure the VTCR_EL2 for this VM. The VTCR value is common
+ * across all the physical CPUs on the system. We use system wide
+ * sanitised values to fill in different fields, except for Hardware
+ * Management of Access Flags. HA Flag is set unconditionally on
+ * all CPUs, as it is safe to run with or without the feature and
+ * the bit is RES0 on CPUs that don't support it.
+ */
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+	u64 vtcr = VTCR_EL2_FLAGS;
+	u32 parange, phys_shift;
+	u8 lvls;
+
+	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+		return -EINVAL;
+
+	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+	if (phys_shift) {
+		if (phys_shift > kvm_ipa_limit ||
+		    phys_shift < 32)
+			return -EINVAL;
+	} else {
+		phys_shift = KVM_PHYS_SHIFT;
+	}
+
+	parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7;
+	if (parange > ID_AA64MMFR0_PARANGE_MAX)
+		parange = ID_AA64MMFR0_PARANGE_MAX;
+	vtcr |= parange << VTCR_EL2_PS_SHIFT;
+
+	vtcr |= VTCR_EL2_T0SZ(phys_shift);
+	/*
+	 * Use a minimum 2 level page table to prevent splitting
+	 * host PMD huge pages at stage2.
+	 */
+	lvls = stage2_pgtable_levels(phys_shift);
+	if (lvls < 2)
+		lvls = 2;
+	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+	/*
+	 * Enable the Hardware Access Flag management, unconditionally
+	 * on all CPUs. The features is RES0 on CPUs without the support
+	 * and must be ignored by the CPUs.
+	 */
+	vtcr |= VTCR_EL2_HA;
+
+	/* Set the vmid bits */
+	vtcr |= (kvm_get_vmid_bits() == 16) ?
+		VTCR_EL2_VS_16BIT :
+		VTCR_EL2_VS_8BIT;
+	kvm->arch.vtcr = vtcr;
+	return 0;
+}
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 68755fd70dcf..69ff9887f724 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -12,7 +12,7 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\
 # when supported by the CPU. Result and argument registers are handled
 # correctly, based on the function prototype.
 lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o
-CFLAGS_atomic_ll_sc.o	:= -fcall-used-x0 -ffixed-x1 -ffixed-x2		\
+CFLAGS_atomic_ll_sc.o	:= -ffixed-x1 -ffixed-x2        		\
 		   -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6		\
 		   -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9		\
 		   -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12	\
@@ -25,3 +25,5 @@ KCOV_INSTRUMENT_atomic_ll_sc.o	:= n
 UBSAN_SANITIZE_atomic_ll_sc.o	:= n
 
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
+
+obj-$(CONFIG_CRC32) += crc32.o
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
new file mode 100644
index 000000000000..5bc1e85b4e1c
--- /dev/null
+++ b/arch/arm64/lib/crc32.S
@@ -0,0 +1,60 @@
+/*
+ * Accelerated CRC32(C) using AArch64 CRC instructions
+ *
+ * Copyright (C) 2016 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+#include <asm/assembler.h>
+
+	.cpu		generic+crc
+
+	.macro		__crc32, c
+0:	subs		x2, x2, #16
+	b.mi		8f
+	ldp		x3, x4, [x1], #16
+CPU_BE(	rev		x3, x3		)
+CPU_BE(	rev		x4, x4		)
+	crc32\c\()x	w0, w0, x3
+	crc32\c\()x	w0, w0, x4
+	b.ne		0b
+	ret
+
+8:	tbz		x2, #3, 4f
+	ldr		x3, [x1], #8
+CPU_BE(	rev		x3, x3		)
+	crc32\c\()x	w0, w0, x3
+4:	tbz		x2, #2, 2f
+	ldr		w3, [x1], #4
+CPU_BE(	rev		w3, w3		)
+	crc32\c\()w	w0, w0, w3
+2:	tbz		x2, #1, 1f
+	ldrh		w3, [x1], #2
+CPU_BE(	rev16		w3, w3		)
+	crc32\c\()h	w0, w0, w3
+1:	tbz		x2, #0, 0f
+	ldrb		w3, [x1]
+	crc32\c\()b	w0, w0, w3
+0:	ret
+	.endm
+
+	.align		5
+ENTRY(crc32_le)
+alternative_if_not ARM64_HAS_CRC32
+	b		crc32_le_base
+alternative_else_nop_endif
+	__crc32
+ENDPROC(crc32_le)
+
+	.align		5
+ENTRY(__crc32c_le)
+alternative_if_not ARM64_HAS_CRC32
+	b		__crc32c_le_base
+alternative_else_nop_endif
+	__crc32		c
+ENDPROC(__crc32c_le)
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index c127f94da8e2..1f0ea2facf24 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -88,7 +88,7 @@ void verify_cpu_asid_bits(void)
 	}
 }
 
-static void flush_context(unsigned int cpu)
+static void flush_context(void)
 {
 	int i;
 	u64 asid;
@@ -142,7 +142,7 @@ static bool check_update_reserved_asid(u64 asid, u64 newasid)
 	return hit;
 }
 
-static u64 new_context(struct mm_struct *mm, unsigned int cpu)
+static u64 new_context(struct mm_struct *mm)
 {
 	static u32 cur_idx = 1;
 	u64 asid = atomic64_read(&mm->context.id);
@@ -180,7 +180,7 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu)
 	/* We're out of ASIDs, so increment the global generation count */
 	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION,
 						 &asid_generation);
-	flush_context(cpu);
+	flush_context();
 
 	/* We have more ASIDs than CPUs, so this will always succeed */
 	asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1);
@@ -196,6 +196,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	unsigned long flags;
 	u64 asid, old_active_asid;
 
+	if (system_supports_cnp())
+		cpu_set_reserved_ttbr0();
+
 	asid = atomic64_read(&mm->context.id);
 
 	/*
@@ -223,7 +226,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	/* Check that our ASID belongs to the current generation. */
 	asid = atomic64_read(&mm->context.id);
 	if ((asid ^ atomic64_read(&asid_generation)) >> asid_bits) {
-		asid = new_context(mm, cpu);
+		asid = new_context(mm);
 		atomic64_set(&mm->context.id, asid);
 	}
 
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index 65dfc8571bf8..fcb1f2a6d7c6 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -36,8 +36,8 @@ static const struct addr_marker address_markers[] = {
 #endif
 	{ MODULES_VADDR,		"Modules start" },
 	{ MODULES_END,			"Modules end" },
-	{ VMALLOC_START,		"vmalloc() Area" },
-	{ VMALLOC_END,			"vmalloc() End" },
+	{ VMALLOC_START,		"vmalloc() area" },
+	{ VMALLOC_END,			"vmalloc() end" },
 	{ FIXADDR_START,		"Fixmap start" },
 	{ FIXADDR_TOP,			"Fixmap end" },
 	{ PCI_IO_START,			"PCI I/O start" },
@@ -46,7 +46,7 @@ static const struct addr_marker address_markers[] = {
 	{ VMEMMAP_START,		"vmemmap start" },
 	{ VMEMMAP_START + VMEMMAP_SIZE,	"vmemmap end" },
 #endif
-	{ PAGE_OFFSET,			"Linear Mapping" },
+	{ PAGE_OFFSET,			"Linear mapping" },
 	{ -1,				NULL },
 };
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 50b30ff30de4..7d9571f4ae3d 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -37,6 +37,7 @@
 #include <asm/cmpxchg.h>
 #include <asm/cpufeature.h>
 #include <asm/exception.h>
+#include <asm/daifflags.h>
 #include <asm/debug-monitors.h>
 #include <asm/esr.h>
 #include <asm/sysreg.h>
@@ -56,10 +57,16 @@ struct fault_info {
 };
 
 static const struct fault_info fault_info[];
+static struct fault_info debug_fault_info[];
 
 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
-	return fault_info + (esr & 63);
+	return fault_info + (esr & ESR_ELx_FSC);
+}
+
+static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr)
+{
+	return debug_fault_info + DBG_ESR_EVT(esr);
 }
 
 #ifdef CONFIG_KPROBES
@@ -235,9 +242,8 @@ static bool is_el1_instruction_abort(unsigned int esr)
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
 }
 
-static inline bool is_el1_permission_fault(unsigned int esr,
-					   struct pt_regs *regs,
-					   unsigned long addr)
+static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
+					   struct pt_regs *regs)
 {
 	unsigned int ec       = ESR_ELx_EC(esr);
 	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
@@ -283,7 +289,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 		return;
 
-	if (is_el1_permission_fault(esr, regs, addr)) {
+	if (is_el1_permission_fault(addr, esr, regs)) {
 		if (esr & ESR_ELx_WNR)
 			msg = "write to read-only memory";
 		else
@@ -297,9 +303,9 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void __do_user_fault(struct siginfo *info, unsigned int esr)
+static void set_thread_esr(unsigned long address, unsigned int esr)
 {
-	current->thread.fault_address = (unsigned long)info->si_addr;
+	current->thread.fault_address = address;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -352,7 +358,6 @@ static void __do_user_fault(struct siginfo *info, unsigned int esr)
 	}
 
 	current->thread.fault_code = esr;
-	arm64_force_sig_info(info, esr_to_fault_info(esr)->name, current);
 }
 
 static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
@@ -363,14 +368,10 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	 */
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
-		struct siginfo si;
-
-		clear_siginfo(&si);
-		si.si_signo	= inf->sig;
-		si.si_code	= inf->code;
-		si.si_addr	= (void __user *)addr;
 
-		__do_user_fault(&si, esr);
+		set_thread_esr(addr, esr);
+		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
+				      inf->name);
 	} else {
 		__do_kernel_fault(addr, esr, regs);
 	}
@@ -424,9 +425,9 @@ static bool is_el0_instruction_abort(unsigned int esr)
 static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 				   struct pt_regs *regs)
 {
+	const struct fault_info *inf;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
-	struct siginfo si;
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_READ | VM_WRITE;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -454,7 +455,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		mm_flags |= FAULT_FLAG_WRITE;
 	}
 
-	if (addr < TASK_SIZE && is_el1_permission_fault(esr, regs, addr)) {
+	if (addr < TASK_SIZE && is_el1_permission_fault(addr, esr, regs)) {
 		/* regs->orig_addr_limit may be 0 if we entered from EL0 */
 		if (regs->orig_addr_limit == KERNEL_DS)
 			die_kernel_fault("access to user memory with fs=KERNEL_DS",
@@ -562,37 +563,35 @@ retry:
 		return 0;
 	}
 
-	clear_siginfo(&si);
-	si.si_addr = (void __user *)addr;
-
+	inf = esr_to_fault_info(esr);
+	set_thread_esr(addr, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
 		 * this page fault.
 		 */
-		si.si_signo	= SIGBUS;
-		si.si_code	= BUS_ADRERR;
-	} else if (fault & VM_FAULT_HWPOISON_LARGE) {
-		unsigned int hindex = VM_FAULT_GET_HINDEX(fault);
-
-		si.si_signo	= SIGBUS;
-		si.si_code	= BUS_MCEERR_AR;
-		si.si_addr_lsb	= hstate_index_to_shift(hindex);
-	} else if (fault & VM_FAULT_HWPOISON) {
-		si.si_signo	= SIGBUS;
-		si.si_code	= BUS_MCEERR_AR;
-		si.si_addr_lsb	= PAGE_SHIFT;
+		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,
+				      inf->name);
+	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
+		unsigned int lsb;
+
+		lsb = PAGE_SHIFT;
+		if (fault & VM_FAULT_HWPOISON_LARGE)
+			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+
+		arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,
+				       inf->name);
 	} else {
 		/*
 		 * Something tried to access memory that isn't in our memory
 		 * map.
 		 */
-		si.si_signo	= SIGSEGV;
-		si.si_code	= fault == VM_FAULT_BADACCESS ?
-				  SEGV_ACCERR : SEGV_MAPERR;
+		arm64_force_sig_fault(SIGSEGV,
+				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
+				      (void __user *)addr,
+				      inf->name);
 	}
 
-	__do_user_fault(&si, esr);
 	return 0;
 
 no_context:
@@ -625,8 +624,8 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 
 static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
-	struct siginfo info;
 	const struct fault_info *inf;
+	void __user *siaddr;
 
 	inf = esr_to_fault_info(esr);
 
@@ -645,15 +644,11 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 			nmi_exit();
 	}
 
-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code  = inf->code;
 	if (esr & ESR_ELx_FnV)
-		info.si_addr = NULL;
+		siaddr = NULL;
 	else
-		info.si_addr  = (void __user *)addr;
-	arm64_notify_die(inf->name, regs, &info, esr);
+		siaddr  = (void __user *)addr;
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
 }
@@ -734,7 +729,6 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
 					 struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
-	struct siginfo info;
 
 	if (!inf->fn(addr, esr, regs))
 		return;
@@ -745,12 +739,8 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
 		show_pte(addr);
 	}
 
-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code  = inf->code;
-	info.si_addr  = (void __user *)addr;
-	arm64_notify_die(inf->name, regs, &info, esr);
+	arm64_notify_die(inf->name, regs,
+			 inf->sig, inf->code, (void __user *)addr, esr);
 }
 
 asmlinkage void __exception do_el0_irq_bp_hardening(void)
@@ -771,7 +761,7 @@ asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
 	if (addr > TASK_SIZE)
 		arm64_apply_bp_hardening();
 
-	local_irq_enable();
+	local_daif_restore(DAIF_PROCCTX);
 	do_mem_abort(addr, esr, regs);
 }
 
@@ -780,20 +770,14 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
 					   unsigned int esr,
 					   struct pt_regs *regs)
 {
-	struct siginfo info;
-
 	if (user_mode(regs)) {
 		if (instruction_pointer(regs) > TASK_SIZE)
 			arm64_apply_bp_hardening();
-		local_irq_enable();
+		local_daif_restore(DAIF_PROCCTX);
 	}
 
-	clear_siginfo(&info);
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code  = BUS_ADRALN;
-	info.si_addr  = (void __user *)addr;
-	arm64_notify_die("SP/PC alignment exception", regs, &info, esr);
+	arm64_notify_die("SP/PC alignment exception", regs,
+			 SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
 }
 
 int __init early_brk64(unsigned long addr, unsigned int esr,
@@ -831,7 +815,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 					      unsigned int esr,
 					      struct pt_regs *regs)
 {
-	const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr);
+	const struct fault_info *inf = esr_to_debug_fault_info(esr);
 	int rv;
 
 	/*
@@ -847,14 +831,8 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 	if (!inf->fn(addr, esr, regs)) {
 		rv = 1;
 	} else {
-		struct siginfo info;
-
-		clear_siginfo(&info);
-		info.si_signo = inf->sig;
-		info.si_errno = 0;
-		info.si_code  = inf->code;
-		info.si_addr  = (void __user *)addr;
-		arm64_notify_die(inf->name, regs, &info, esr);
+		arm64_notify_die(inf->name, regs,
+				 inf->sig, inf->code, (void __user *)addr, esr);
 		rv = 0;
 	}
 
@@ -864,17 +842,3 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 	return rv;
 }
 NOKPROBE_SYMBOL(do_debug_exception);
-
-#ifdef CONFIG_ARM64_PAN
-void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
-{
-	/*
-	 * We modify PSTATE. This won't work from irq context as the PSTATE
-	 * is discarded once we return from the exception.
-	 */
-	WARN_ON_ONCE(in_interrupt());
-
-	sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0);
-	asm(SET_PSTATE_PAN(1));
-}
-#endif /* CONFIG_ARM64_PAN */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 192b3ba07075..f58ea503ad01 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -117,11 +117,14 @@ static pte_t get_clear_flush(struct mm_struct *mm,
 
 		/*
 		 * If HW_AFDBM is enabled, then the HW could turn on
-		 * the dirty bit for any page in the set, so check
-		 * them all.  All hugetlb entries are already young.
+		 * the dirty or accessed bit for any page in the set,
+		 * so check them all.
 		 */
 		if (pte_dirty(pte))
 			orig_pte = pte_mkdirty(orig_pte);
+
+		if (pte_young(pte))
+			orig_pte = pte_mkyoung(orig_pte);
 	}
 
 	if (valid) {
@@ -320,11 +323,40 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 	return get_clear_flush(mm, addr, ptep, pgsize, ncontig);
 }
 
+/*
+ * huge_ptep_set_access_flags will update access flags (dirty, accesssed)
+ * and write permission.
+ *
+ * For a contiguous huge pte range we need to check whether or not write
+ * permission has to change only on the first pte in the set. Then for
+ * all the contiguous ptes we need to check whether or not there is a
+ * discrepancy between dirty or young.
+ */
+static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
+{
+	int i;
+
+	if (pte_write(pte) != pte_write(huge_ptep_get(ptep)))
+		return 1;
+
+	for (i = 0; i < ncontig; i++) {
+		pte_t orig_pte = huge_ptep_get(ptep + i);
+
+		if (pte_dirty(pte) != pte_dirty(orig_pte))
+			return 1;
+
+		if (pte_young(pte) != pte_young(orig_pte))
+			return 1;
+	}
+
+	return 0;
+}
+
 int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 			       unsigned long addr, pte_t *ptep,
 			       pte_t pte, int dirty)
 {
-	int ncontig, i, changed = 0;
+	int ncontig, i;
 	size_t pgsize = 0;
 	unsigned long pfn = pte_pfn(pte), dpfn;
 	pgprot_t hugeprot;
@@ -336,19 +368,23 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 	ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
 	dpfn = pgsize >> PAGE_SHIFT;
 
+	if (!__cont_access_flags_changed(ptep, pte, ncontig))
+		return 0;
+
 	orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
-	if (!pte_same(orig_pte, pte))
-		changed = 1;
 
-	/* Make sure we don't lose the dirty state */
+	/* Make sure we don't lose the dirty or young state */
 	if (pte_dirty(orig_pte))
 		pte = pte_mkdirty(pte);
 
+	if (pte_young(orig_pte))
+		pte = pte_mkyoung(pte);
+
 	hugeprot = pte_pgprot(pte);
 	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
 		set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot));
 
-	return changed;
+	return 1;
 }
 
 void huge_ptep_set_wrprotect(struct mm_struct *mm,
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 787e27964ab9..3cf87341859f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -284,7 +284,6 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
 
 #endif /* CONFIG_NUMA */
 
-#ifdef CONFIG_HAVE_ARCH_PFN_VALID
 int pfn_valid(unsigned long pfn)
 {
 	phys_addr_t addr = pfn << PAGE_SHIFT;
@@ -294,7 +293,6 @@ int pfn_valid(unsigned long pfn)
 	return memblock_is_map_memory(addr);
 }
 EXPORT_SYMBOL(pfn_valid);
-#endif
 
 #ifndef CONFIG_SPARSEMEM
 static void __init arm64_memory_present(void)
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 12145874c02b..fccb1a6f8c6f 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -192,7 +192,7 @@ void __init kasan_init(void)
 
 	/*
 	 * We are going to perform proper setup of shadow memory.
-	 * At first we should unmap early shadow (clear_pgds() call bellow).
+	 * At first we should unmap early shadow (clear_pgds() call below).
 	 * However, instrumented code couldn't execute without shadow memory.
 	 * tmp_pg_dir used to keep early shadow mapped until full shadow
 	 * setup will be finished.
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8080c9f489c3..9498c15b847b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -67,6 +67,24 @@ static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
 static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
 static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
 
+static DEFINE_SPINLOCK(swapper_pgdir_lock);
+
+void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	pgd_t *fixmap_pgdp;
+
+	spin_lock(&swapper_pgdir_lock);
+	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
+	WRITE_ONCE(*fixmap_pgdp, pgd);
+	/*
+	 * We need dsb(ishst) here to ensure the page-table-walker sees
+	 * our new entry before set_p?d() returns. The fixmap's
+	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
+	 */
+	pgd_clear_fixmap();
+	spin_unlock(&swapper_pgdir_lock);
+}
+
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 			      unsigned long size, pgprot_t vma_prot)
 {
@@ -629,34 +647,18 @@ static void __init map_kernel(pgd_t *pgdp)
  */
 void __init paging_init(void)
 {
-	phys_addr_t pgd_phys = early_pgtable_alloc();
-	pgd_t *pgdp = pgd_set_fixmap(pgd_phys);
+	pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
 
 	map_kernel(pgdp);
 	map_mem(pgdp);
 
-	/*
-	 * We want to reuse the original swapper_pg_dir so we don't have to
-	 * communicate the new address to non-coherent secondaries in
-	 * secondary_entry, and so cpu_switch_mm can generate the address with
-	 * adrp+add rather than a load from some global variable.
-	 *
-	 * To do this we need to go via a temporary pgd.
-	 */
-	cpu_replace_ttbr1(__va(pgd_phys));
-	memcpy(swapper_pg_dir, pgdp, PGD_SIZE);
-	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
-
 	pgd_clear_fixmap();
-	memblock_free(pgd_phys, PAGE_SIZE);
 
-	/*
-	 * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
-	 * allocated with it.
-	 */
-	memblock_free(__pa_symbol(swapper_pg_dir) + PAGE_SIZE,
-		      __pa_symbol(swapper_pg_end) - __pa_symbol(swapper_pg_dir)
-		      - PAGE_SIZE);
+	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+	init_mm.pgd = swapper_pg_dir;
+
+	memblock_free(__pa_symbol(init_pg_dir),
+		      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
 }
 
 /*
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 146c04ceaa51..d7b66fc5e1c5 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -391,7 +391,6 @@ static int __init numa_init(int (*init_func)(void))
 	nodes_clear(numa_nodes_parsed);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
-	numa_free_distance();
 
 	ret = numa_alloc_distance();
 	if (ret < 0)
@@ -399,20 +398,24 @@ static int __init numa_init(int (*init_func)(void))
 
 	ret = init_func();
 	if (ret < 0)
-		return ret;
+		goto out_free_distance;
 
 	if (nodes_empty(numa_nodes_parsed)) {
 		pr_info("No NUMA configuration found\n");
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out_free_distance;
 	}
 
 	ret = numa_register_nodes();
 	if (ret < 0)
-		return ret;
+		goto out_free_distance;
 
 	setup_node_to_cpumask_map();
 
 	return 0;
+out_free_distance:
+	numa_free_distance();
+	return ret;
 }
 
 /**
@@ -432,7 +435,7 @@ static int __init dummy_numa_init(void)
 	if (numa_off)
 		pr_info("NUMA disabled\n"); /* Forced off on command line. */
 	pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n",
-		0LLU, PFN_PHYS(max_pfn) - 1);
+		memblock_start_of_DRAM(), memblock_end_of_DRAM() - 1);
 
 	for_each_memblock(memory, mblk) {
 		ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 03646e6a2ef4..2c75b0b903ae 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -160,6 +160,12 @@ ENTRY(cpu_do_switch_mm)
 	mrs	x2, ttbr1_el1
 	mmid	x1, x1				// get mm->context.id
 	phys_to_ttbr x3, x0
+
+alternative_if ARM64_HAS_CNP
+	cbz     x1, 1f                          // skip CNP for reserved ASID
+	orr     x3, x3, #TTBR_CNP_BIT
+1:
+alternative_else_nop_endif
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	bfi	x3, x1, #48, #16		// set the ASID field in TTBR0
 #endif
@@ -184,7 +190,7 @@ ENDPROC(cpu_do_switch_mm)
 .endm
 
 /*
- * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd)
+ * void idmap_cpu_replace_ttbr1(phys_addr_t ttbr1)
  *
  * This is the low-level counterpart to cpu_replace_ttbr1, and should not be
  * called by anything else. It can only be executed from a TTBR0 mapping.
@@ -194,8 +200,7 @@ ENTRY(idmap_cpu_replace_ttbr1)
 
 	__idmap_cpu_set_reserved_ttbr1 x1, x3
 
-	phys_to_ttbr x3, x0
-	msr	ttbr1_el1, x3
+	msr	ttbr1_el1, x0
 	isb
 
 	restore_daif x2