summaryrefslogtreecommitdiff
path: root/arch/arm64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/Kconfig54
-rw-r--r--arch/arm64/Makefile5
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts1
-rw-r--r--arch/arm64/crypto/Kconfig17
-rw-r--r--arch/arm64/crypto/Makefile10
-rw-r--r--arch/arm64/crypto/aes-neonbs-glue.c2
-rw-r--r--arch/arm64/crypto/chacha-neon-glue.c81
-rw-r--r--arch/arm64/crypto/ghash-ce-core.S501
-rw-r--r--arch/arm64/crypto/ghash-ce-glue.c293
-rw-r--r--arch/arm64/crypto/poly1305-armv8.pl913
-rw-r--r--arch/arm64/crypto/poly1305-core.S_shipped835
-rw-r--r--arch/arm64/crypto/poly1305-glue.c237
-rw-r--r--arch/arm64/include/asm/asm-uaccess.h25
-rw-r--r--arch/arm64/include/asm/barrier.h12
-rw-r--r--arch/arm64/include/asm/cache.h3
-rw-r--r--arch/arm64/include/asm/cpucaps.h4
-rw-r--r--arch/arm64/include/asm/cpufeature.h14
-rw-r--r--arch/arm64/include/asm/daifflags.h19
-rw-r--r--arch/arm64/include/asm/exception.h22
-rw-r--r--arch/arm64/include/asm/ftrace.h23
-rw-r--r--arch/arm64/include/asm/insn.h3
-rw-r--r--arch/arm64/include/asm/irqflags.h19
-rw-r--r--arch/arm64/include/asm/kvm_arm.h3
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h26
-rw-r--r--arch/arm64/include/asm/kvm_host.h40
-rw-r--r--arch/arm64/include/asm/memory.h6
-rw-r--r--arch/arm64/include/asm/module.h2
-rw-r--r--arch/arm64/include/asm/paravirt.h9
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h2
-rw-r--r--arch/arm64/include/asm/pgtable.h16
-rw-r--r--arch/arm64/include/asm/processor.h16
-rw-r--r--arch/arm64/include/asm/pvclock-abi.h17
-rw-r--r--arch/arm64/include/asm/syscall_wrapper.h6
-rw-r--r--arch/arm64/include/asm/traps.h10
-rw-r--r--arch/arm64/include/asm/uaccess.h27
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h5
-rw-r--r--arch/arm64/kernel/Makefile6
-rw-r--r--arch/arm64/kernel/asm-offsets.c1
-rw-r--r--arch/arm64/kernel/cpu_errata.c147
-rw-r--r--arch/arm64/kernel/cpufeature.c1
-rw-r--r--arch/arm64/kernel/cpuinfo.c2
-rw-r--r--arch/arm64/kernel/entry-common.c332
-rw-r--r--arch/arm64/kernel/entry-ftrace.S140
-rw-r--r--arch/arm64/kernel/entry.S281
-rw-r--r--arch/arm64/kernel/fpsimd.c6
-rw-r--r--arch/arm64/kernel/ftrace.c123
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c8
-rw-r--r--arch/arm64/kernel/insn.c13
-rw-r--r--arch/arm64/kernel/kaslr.c44
-rw-r--r--arch/arm64/kernel/module-plts.c3
-rw-r--r--arch/arm64/kernel/module.c57
-rw-r--r--arch/arm64/kernel/paravirt.c140
-rw-r--r--arch/arm64/kernel/perf_event.c191
-rw-r--r--arch/arm64/kernel/probes/kprobes.c4
-rw-r--r--arch/arm64/kernel/psci.c15
-rw-r--r--arch/arm64/kernel/sdei.c3
-rw-r--r--arch/arm64/kernel/smp.c11
-rw-r--r--arch/arm64/kernel/sys_compat.c11
-rw-r--r--arch/arm64/kernel/syscall.c4
-rw-r--r--arch/arm64/kernel/time.c3
-rw-r--r--arch/arm64/kernel/traps.c21
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S13
-rw-r--r--arch/arm64/kvm/Kconfig4
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/guest.c23
-rw-r--r--arch/arm64/kvm/handle_exit.c4
-rw-r--r--arch/arm64/kvm/hyp/switch.c52
-rw-r--r--arch/arm64/kvm/hyp/sysreg-sr.c35
-rw-r--r--arch/arm64/kvm/hyp/tlb.c23
-rw-r--r--arch/arm64/kvm/inject_fault.c4
-rw-r--r--arch/arm64/lib/clear_user.S2
-rw-r--r--arch/arm64/lib/copy_from_user.S2
-rw-r--r--arch/arm64/lib/copy_in_user.S2
-rw-r--r--arch/arm64/lib/copy_to_user.S2
-rw-r--r--arch/arm64/lib/uaccess_flushcache.c6
-rw-r--r--arch/arm64/mm/fault.c64
-rw-r--r--arch/arm64/mm/init.c91
-rw-r--r--arch/arm64/mm/mmu.c7
78 files changed, 4051 insertions, 1130 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3f047afb982c..afe6412fe769 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -67,7 +67,7 @@ config ARM64
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_SUPPORTS_MEMORY_FAILURE
select ARCH_SUPPORTS_ATOMIC_RMW
- select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG
+ select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
@@ -143,6 +143,8 @@ config ARM64
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
+ select HAVE_DYNAMIC_FTRACE_WITH_REGS \
+ if $(cc-option,-fpatchable-function-entry=2)
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_FAST_GUP
select HAVE_FTRACE_MCOUNT_RECORD
@@ -180,7 +182,6 @@ config ARM64
select PCI_SYSCALL if PCI
select POWER_RESET
select POWER_SUPPLY
- select REFCOUNT_FULL
select SPARSE_IRQ
select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
@@ -266,6 +267,10 @@ config GENERIC_CSUM
config GENERIC_CALIBRATE_DELAY
def_bool y
+config ZONE_DMA
+ bool "Support DMA zone" if EXPERT
+ default y
+
config ZONE_DMA32
bool "Support DMA32 zone" if EXPERT
default y
@@ -538,6 +543,16 @@ config ARM64_ERRATUM_1286807
invalidated has been observed by other observers. The
workaround repeats the TLBI+DSB operation.
+config ARM64_ERRATUM_1319367
+ bool "Cortex-A57/A72: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation"
+ default y
+ help
+ This option adds work arounds for ARM Cortex-A57 erratum 1319537
+ and A72 erratum 1319367
+
+ Cortex-A57 and A72 cores could end-up with corrupted TLBs by
+ speculating an AT instruction during a guest context switch.
+
If unsure, say Y.
config ARM64_ERRATUM_1463225
@@ -558,6 +573,22 @@ config ARM64_ERRATUM_1463225
If unsure, say Y.
+config ARM64_ERRATUM_1542419
+ bool "Neoverse-N1: workaround mis-ordering of instruction fetches"
+ default y
+ help
+ This option adds a workaround for ARM Neoverse-N1 erratum
+ 1542419.
+
+ Affected Neoverse-N1 cores could execute a stale instruction when
+ modified by another CPU. The workaround depends on a firmware
+ counterpart.
+
+ Workaround the issue by hiding the DIC feature from EL0. This
+ forces user-space to perform cache maintenance.
+
+ If unsure, say Y.
+
config CAVIUM_ERRATUM_22375
bool "Cavium erratum 22375, 24313"
default y
@@ -845,10 +876,26 @@ config ARM64_PA_BITS
default 48 if ARM64_PA_BITS_48
default 52 if ARM64_PA_BITS_52
+choice
+ prompt "Endianness"
+ default CPU_LITTLE_ENDIAN
+ help
+ Select the endianness of data accesses performed by the CPU. Userspace
+ applications will need to be compiled and linked for the endianness
+ that is selected here.
+
config CPU_BIG_ENDIAN
bool "Build big-endian kernel"
help
- Say Y if you plan on running a kernel in big-endian mode.
+ Say Y if you plan on running a kernel with a big-endian userspace.
+
+config CPU_LITTLE_ENDIAN
+ bool "Build little-endian kernel"
+ help
+ Say Y if you plan on running a kernel with a little-endian userspace.
+ This is usually the case for distributions targeting arm64.
+
+endchoice
config SCHED_MC
bool "Multi-core scheduler support"
@@ -1597,6 +1644,7 @@ config CMDLINE
config CMDLINE_FORCE
bool "Always use the default kernel command string"
+ depends on CMDLINE != ""
help
Always use the default kernel command string, even if the boot
loader passes other arguments to the kernel.
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 2c0238ce0551..1fbe24d4fdb6 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -95,6 +95,11 @@ ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
KBUILD_LDS_MODULE += $(srctree)/arch/arm64/kernel/module.lds
endif
+ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y)
+ KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
+ CC_FLAGS_FTRACE := -fpatchable-function-entry=2
+endif
+
# Default value
head-y := arch/arm64/kernel/head.o
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
index 1d05d570142f..ce4b0679839d 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
@@ -252,6 +252,7 @@
};
&r_ir {
+ linux,rc-map-name = "rc-beelink-gs1";
status = "okay";
};
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 4922c4451e7c..b8eb0453123d 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -86,7 +86,7 @@ config CRYPTO_AES_ARM64_CE_CCM
config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
select CRYPTO_SIMD
@@ -94,7 +94,7 @@ config CRYPTO_AES_ARM64_CE_BLK
config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
select CRYPTO_SIMD
@@ -102,8 +102,15 @@ config CRYPTO_AES_ARM64_NEON_BLK
config CRYPTO_CHACHA20_NEON
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_NEON
+ tristate "Poly1305 hash function using scalar or NEON instructions"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_HASH
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_NHPOLY1305_NEON
tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
@@ -113,7 +120,7 @@ config CRYPTO_NHPOLY1305_NEON
config CRYPTO_AES_ARM64_BS
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 0435f2a0610e..d0901e610df3 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -50,6 +50,10 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
+poly1305-neon-y := poly1305-core.o poly1305-glue.o
+AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
+
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
@@ -68,11 +72,15 @@ ifdef REGENERATE_ARM64_CRYPTO
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
+$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
+ $(call cmd,perlasm)
+
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
+
endif
-clean-files += sha256-core.S sha512-core.S
+clean-files += poly1305-core.S sha256-core.S sha512-core.S
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index ea873b8904c4..e3e27349a9fe 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -384,7 +384,7 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
goto xts_tail;
kernel_neon_end();
- skcipher_walk_done(&walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
if (err || likely(!tail))
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 1495d2b18518..b08029d7bde6 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -1,5 +1,5 @@
/*
- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
@@ -20,9 +20,10 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -36,6 +37,8 @@ asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
@@ -59,6 +62,37 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
}
}
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, stream, nrounds);
+ kernel_neon_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, bytes, nrounds);
+ kernel_neon_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
@@ -68,7 +102,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
err = skcipher_walk_virt(&walk, req, false);
- crypto_chacha_init(state, ctx, iv);
+ chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
@@ -76,10 +110,17 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
if (nbytes < walk.total)
nbytes = rounddown(nbytes, walk.stride);
- kernel_neon_begin();
- chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
- kernel_neon_end();
+ if (!static_branch_likely(&have_neon) ||
+ !crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ kernel_neon_end();
+ }
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@@ -91,9 +132,6 @@ static int chacha_neon(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
-
return chacha_neon_stream_xor(req, ctx, req->iv);
}
@@ -105,14 +143,8 @@ static int xchacha_neon(struct skcipher_request *req)
u32 state[16];
u8 real_iv[16];
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
-
- crypto_chacha_init(state, ctx, req->iv);
-
- kernel_neon_begin();
- hchacha_block_neon(state, subctx.key, ctx->nrounds);
- kernel_neon_end();
+ chacha_init_generic(state, ctx->key, req->iv);
+ hchacha_block_arch(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
@@ -134,7 +166,7 @@ static struct skcipher_alg algs[] = {
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
@@ -150,7 +182,7 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
@@ -166,7 +198,7 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
@@ -175,14 +207,17 @@ static struct skcipher_alg algs[] = {
static int __init chacha_simd_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
- return -ENODEV;
+ return 0;
+
+ static_branch_enable(&have_neon);
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ if (cpu_have_named_feature(ASIMD))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 410e8afcf5a7..a791c4adf8e6 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -13,8 +13,8 @@
T1 .req v2
T2 .req v3
MASK .req v4
- XL .req v5
- XM .req v6
+ XM .req v5
+ XL .req v6
XH .req v7
IN1 .req v7
@@ -358,20 +358,37 @@ ENTRY(pmull_ghash_update_p8)
__pmull_ghash p8
ENDPROC(pmull_ghash_update_p8)
- KS0 .req v12
- KS1 .req v13
- INP0 .req v14
- INP1 .req v15
-
- .macro load_round_keys, rounds, rk
- cmp \rounds, #12
- blo 2222f /* 128 bits */
- beq 1111f /* 192 bits */
- ld1 {v17.4s-v18.4s}, [\rk], #32
-1111: ld1 {v19.4s-v20.4s}, [\rk], #32
-2222: ld1 {v21.4s-v24.4s}, [\rk], #64
- ld1 {v25.4s-v28.4s}, [\rk], #64
- ld1 {v29.4s-v31.4s}, [\rk]
+ KS0 .req v8
+ KS1 .req v9
+ KS2 .req v10
+ KS3 .req v11
+
+ INP0 .req v21
+ INP1 .req v22
+ INP2 .req v23
+ INP3 .req v24
+
+ K0 .req v25
+ K1 .req v26
+ K2 .req v27
+ K3 .req v28
+ K4 .req v12
+ K5 .req v13
+ K6 .req v4
+ K7 .req v5
+ K8 .req v14
+ K9 .req v15
+ KK .req v29
+ KL .req v30
+ KM .req v31
+
+ .macro load_round_keys, rounds, rk, tmp
+ add \tmp, \rk, #64
+ ld1 {K0.4s-K3.4s}, [\rk]
+ ld1 {K4.4s-K5.4s}, [\tmp]
+ add \tmp, \rk, \rounds, lsl #4
+ sub \tmp, \tmp, #32
+ ld1 {KK.4s-KM.4s}, [\tmp]
.endm
.macro enc_round, state, key
@@ -379,197 +396,367 @@ ENDPROC(pmull_ghash_update_p8)
aesmc \state\().16b, \state\().16b
.endm
- .macro enc_block, state, rounds
- cmp \rounds, #12
- b.lo 2222f /* 128 bits */
- b.eq 1111f /* 192 bits */
- enc_round \state, v17
- enc_round \state, v18
-1111: enc_round \state, v19
- enc_round \state, v20
-2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ .macro enc_qround, s0, s1, s2, s3, key
+ enc_round \s0, \key
+ enc_round \s1, \key
+ enc_round \s2, \key
+ enc_round \s3, \key
+ .endm
+
+ .macro enc_block, state, rounds, rk, tmp
+ add \tmp, \rk, #96
+ ld1 {K6.4s-K7.4s}, [\tmp], #32
+ .irp key, K0, K1, K2, K3, K4 K5
enc_round \state, \key
.endr
- aese \state\().16b, v30.16b
- eor \state\().16b, \state\().16b, v31.16b
+
+ tbnz \rounds, #2, .Lnot128_\@
+.Lout256_\@:
+ enc_round \state, K6
+ enc_round \state, K7
+
+.Lout192_\@:
+ enc_round \state, KK
+ aese \state\().16b, KL.16b
+ eor \state\().16b, \state\().16b, KM.16b
+
+ .subsection 1
+.Lnot128_\@:
+ ld1 {K8.4s-K9.4s}, [\tmp], #32
+ enc_round \state, K6
+ enc_round \state, K7
+ ld1 {K6.4s-K7.4s}, [\tmp]
+ enc_round \state, K8
+ enc_round \state, K9
+ tbz \rounds, #1, .Lout192_\@
+ b .Lout256_\@
+ .previous
.endm
+ .align 6
.macro pmull_gcm_do_crypt, enc
- ld1 {SHASH.2d}, [x4], #16
- ld1 {HH.2d}, [x4]
- ld1 {XL.2d}, [x1]
- ldr x8, [x5, #8] // load lower counter
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ str x19, [sp, #24]
+
+ load_round_keys x7, x6, x8
+
+ ld1 {SHASH.2d}, [x3], #16
+ ld1 {HH.2d-HH4.2d}, [x3]
- movi MASK.16b, #0xe1
trn1 SHASH2.2d, SHASH.2d, HH.2d
trn2 T1.2d, SHASH.2d, HH.2d
-CPU_LE( rev x8, x8 )
- shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, T1.16b
- .if \enc == 1
- ldr x10, [sp]
- ld1 {KS0.16b-KS1.16b}, [x10]
- .endif
+ trn1 HH34.2d, HH3.2d, HH4.2d
+ trn2 T1.2d, HH3.2d, HH4.2d
+ eor HH34.16b, HH34.16b, T1.16b
- cbnz x6, 4f
+ ld1 {XL.2d}, [x4]
-0: ld1 {INP0.16b-INP1.16b}, [x3], #32
+ cbz x0, 3f // tag only?
- rev x9, x8
- add x11, x8, #1
- add x8, x8, #2
+ ldr w8, [x5, #12] // load lower counter
+CPU_LE( rev w8, w8 )
- .if \enc == 1
- eor INP0.16b, INP0.16b, KS0.16b // encrypt input
- eor INP1.16b, INP1.16b, KS1.16b
+0: mov w9, #4 // max blocks per round
+ add x10, x0, #0xf
+ lsr x10, x10, #4 // remaining blocks
+
+ subs x0, x0, #64
+ csel w9, w10, w9, mi
+ add w8, w8, w9
+
+ bmi 1f
+ ld1 {INP0.16b-INP3.16b}, [x2], #64
+ .subsection 1
+ /*
+ * Populate the four input registers right to left with up to 63 bytes
+ * of data, using overlapping loads to avoid branches.
+ *
+ * INP0 INP1 INP2 INP3
+ * 1 byte | | | |x |
+ * 16 bytes | | | |xxxxxxxx|
+ * 17 bytes | | |xxxxxxxx|x |
+ * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx |
+ * etc etc
+ *
+ * Note that this code may read up to 15 bytes before the start of
+ * the input. It is up to the calling code to ensure this is safe if
+ * this happens in the first iteration of the loop (i.e., when the
+ * input size is < 16 bytes)
+ */
+1: mov x15, #16
+ ands x19, x0, #0xf
+ csel x19, x19, x15, ne
+ adr_l x17, .Lpermute_table + 16
+
+ sub x11, x15, x19
+ add x12, x17, x11
+ sub x17, x17, x11
+ ld1 {T1.16b}, [x12]
+ sub x10, x1, x11
+ sub x11, x2, x11
+
+ cmp x0, #-16
+ csel x14, x15, xzr, gt
+ cmp x0, #-32
+ csel x15, x15, xzr, gt
+ cmp x0, #-48
+ csel x16, x19, xzr, gt
+ csel x1, x1, x10, gt
+ csel x2, x2, x11, gt
+
+ ld1 {INP0.16b}, [x2], x14
+ ld1 {INP1.16b}, [x2], x15
+ ld1 {INP2.16b}, [x2], x16
+ ld1 {INP3.16b}, [x2]
+ tbl INP3.16b, {INP3.16b}, T1.16b
+ b 2f
+ .previous
+
+2: .if \enc == 0
+ bl pmull_gcm_ghash_4x
.endif
- ld1 {KS0.8b}, [x5] // load upper counter
- rev x11, x11
- sub w0, w0, #2
- mov KS1.8b, KS0.8b
- ins KS0.d[1], x9 // set lower counter
- ins KS1.d[1], x11
+ bl pmull_gcm_enc_4x
- rev64 T1.16b, INP1.16b
+ tbnz x0, #63, 6f
+ st1 {INP0.16b-INP3.16b}, [x1], #64
+ .if \enc == 1
+ bl pmull_gcm_ghash_4x
+ .endif
+ bne 0b
- cmp w7, #12
- b.ge 2f // AES-192/256?
+3: ldp x19, x10, [sp, #24]
+ cbz x10, 5f // output tag?
-1: enc_round KS0, v21
- ext IN1.16b, T1.16b, T1.16b, #8
+ ld1 {INP3.16b}, [x10] // load lengths[]
+ mov w9, #1
+ bl pmull_gcm_ghash_4x
- enc_round KS1, v21
- pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
+ mov w11, #(0x1 << 24) // BE '1U'
+ ld1 {KS0.16b}, [x5]
+ mov KS0.s[3], w11
- enc_round KS0, v22
- eor T1.16b, T1.16b, IN1.16b
+ enc_block KS0, x7, x6, x12
- enc_round KS1, v22
- pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
+ ext XL.16b, XL.16b, XL.16b, #8
+ rev64 XL.16b, XL.16b
+ eor XL.16b, XL.16b, KS0.16b
+ st1 {XL.16b}, [x10] // store tag
- enc_round KS0, v23
- pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+4: ldp x29, x30, [sp], #32
+ ret
- enc_round KS1, v23
- rev64 T1.16b, INP0.16b
- ext T2.16b, XL.16b, XL.16b, #8
+5:
+CPU_LE( rev w8, w8 )
+ str w8, [x5, #12] // store lower counter
+ st1 {XL.2d}, [x4]
+ b 4b
+
+6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
+ sub x17, x17, x19, lsl #1
+
+ cmp w9, #1
+ beq 7f
+ .subsection 1
+7: ld1 {INP2.16b}, [x1]
+ tbx INP2.16b, {INP3.16b}, T1.16b
+ mov INP3.16b, INP2.16b
+ b 8f
+ .previous
+
+ st1 {INP0.16b}, [x1], x14
+ st1 {INP1.16b}, [x1], x15
+ st1 {INP2.16b}, [x1], x16
+ tbl INP3.16b, {INP3.16b}, T1.16b
+ tbx INP3.16b, {INP2.16b}, T2.16b
+8: st1 {INP3.16b}, [x1]
- enc_round KS0, v24
- ext IN1.16b, T1.16b, T1.16b, #8
- eor T1.16b, T1.16b, T2.16b
+ .if \enc == 1
+ ld1 {T1.16b}, [x17]
+ tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
+ bl pmull_gcm_ghash_4x
+ .endif
+ b 3b
+ .endm
- enc_round KS1, v24
- eor XL.16b, XL.16b, IN1.16b
+ /*
+ * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u64 dg[], u8 ctr[],
+ * int rounds, u8 tag)
+ */
+ENTRY(pmull_gcm_encrypt)
+ pmull_gcm_do_crypt 1
+ENDPROC(pmull_gcm_encrypt)
- enc_round KS0, v25
- eor T1.16b, T1.16b, XL.16b
+ /*
+ * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u64 dg[], u8 ctr[],
+ * int rounds, u8 tag)
+ */
+ENTRY(pmull_gcm_decrypt)
+ pmull_gcm_do_crypt 0
+ENDPROC(pmull_gcm_decrypt)
- enc_round KS1, v25
- pmull2 XH.1q, HH.2d, XL.2d // a1 * b1
+pmull_gcm_ghash_4x:
+ movi MASK.16b, #0xe1
+ shl MASK.2d, MASK.2d, #57
- enc_round KS0, v26
- pmull XL.1q, HH.1d, XL.1d // a0 * b0
+ rev64 T1.16b, INP0.16b
+ rev64 T2.16b, INP1.16b
+ rev64 TT3.16b, INP2.16b
+ rev64 TT4.16b, INP3.16b
- enc_round KS1, v26
- pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0)
+ ext XL.16b, XL.16b, XL.16b, #8
- enc_round KS0, v27
- eor XL.16b, XL.16b, XL2.16b
- eor XH.16b, XH.16b, XH2.16b
+ tbz w9, #2, 0f // <4 blocks?
+ .subsection 1
+0: movi XH2.16b, #0
+ movi XM2.16b, #0
+ movi XL2.16b, #0
- enc_round KS1, v27
- eor XM.16b, XM.16b, XM2.16b
- ext T1.16b, XL.16b, XH.16b, #8
+ tbz w9, #0, 1f // 2 blocks?
+ tbz w9, #1, 2f // 1 block?
- enc_round KS0, v28
- eor T2.16b, XL.16b, XH.16b
- eor XM.16b, XM.16b, T1.16b
+ eor T2.16b, T2.16b, XL.16b
+ ext T1.16b, T2.16b, T2.16b, #8
+ b .Lgh3
- enc_round KS1, v28
- eor XM.16b, XM.16b, T2.16b
+1: eor TT3.16b, TT3.16b, XL.16b
+ ext T2.16b, TT3.16b, TT3.16b, #8
+ b .Lgh2
- enc_round KS0, v29
- pmull T2.1q, XL.1d, MASK.1d
+2: eor TT4.16b, TT4.16b, XL.16b
+ ext IN1.16b, TT4.16b, TT4.16b, #8
+ b .Lgh1
+ .previous
- enc_round KS1, v29
- mov XH.d[0], XM.d[1]
- mov XM.d[1], XL.d[0]
+ eor T1.16b, T1.16b, XL.16b
+ ext IN1.16b, T1.16b, T1.16b, #8
- aese KS0.16b, v30.16b
- eor XL.16b, XM.16b, T2.16b
+ pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
+ eor T1.16b, T1.16b, IN1.16b
+ pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
+ pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
- aese KS1.16b, v30.16b
- ext T2.16b, XL.16b, XL.16b, #8
+ ext T1.16b, T2.16b, T2.16b, #8
+.Lgh3: eor T2.16b, T2.16b, T1.16b
+ pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
+ pmull XL.1q, HH3.1d, T1.1d // a0 * b0
+ pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
- eor KS0.16b, KS0.16b, v31.16b
- pmull XL.1q, XL.1d, MASK.1d
- eor T2.16b, T2.16b, XH.16b
+ eor XH2.16b, XH2.16b, XH.16b
+ eor XL2.16b, XL2.16b, XL.16b
+ eor XM2.16b, XM2.16b, XM.16b
- eor KS1.16b, KS1.16b, v31.16b
- eor XL.16b, XL.16b, T2.16b
+ ext T2.16b, TT3.16b, TT3.16b, #8
+.Lgh2: eor TT3.16b, TT3.16b, T2.16b
+ pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
+ pmull XL.1q, HH.1d, T2.1d // a0 * b0
+ pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
- .if \enc == 0
- eor INP0.16b, INP0.16b, KS0.16b
- eor INP1.16b, INP1.16b, KS1.16b
- .endif
+ eor XH2.16b, XH2.16b, XH.16b
+ eor XL2.16b, XL2.16b, XL.16b
+ eor XM2.16b, XM2.16b, XM.16b
- st1 {INP0.16b-INP1.16b}, [x2], #32
+ ext IN1.16b, TT4.16b, TT4.16b, #8
+.Lgh1: eor TT4.16b, TT4.16b, IN1.16b
+ pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
+ pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
+ pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
- cbnz w0, 0b
+ eor XH.16b, XH.16b, XH2.16b
+ eor XL.16b, XL.16b, XL2.16b
+ eor XM.16b, XM.16b, XM2.16b
-CPU_LE( rev x8, x8 )
- st1 {XL.2d}, [x1]
- str x8, [x5, #8] // store lower counter
+ eor T2.16b, XL.16b, XH.16b
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor XM.16b, XM.16b, T2.16b
- .if \enc == 1
- st1 {KS0.16b-KS1.16b}, [x10]
- .endif
+ __pmull_reduce_p64
+
+ eor T2.16b, T2.16b, XH.16b
+ eor XL.16b, XL.16b, T2.16b
ret
+ENDPROC(pmull_gcm_ghash_4x)
+
+pmull_gcm_enc_4x:
+ ld1 {KS0.16b}, [x5] // load upper counter
+ sub w10, w8, #4
+ sub w11, w8, #3
+ sub w12, w8, #2
+ sub w13, w8, #1
+ rev w10, w10
+ rev w11, w11
+ rev w12, w12
+ rev w13, w13
+ mov KS1.16b, KS0.16b
+ mov KS2.16b, KS0.16b
+ mov KS3.16b, KS0.16b
+ ins KS0.s[3], w10 // set lower counter
+ ins KS1.s[3], w11
+ ins KS2.s[3], w12
+ ins KS3.s[3], w13
+
+ add x10, x6, #96 // round key pointer
+ ld1 {K6.4s-K7.4s}, [x10], #32
+ .irp key, K0, K1, K2, K3, K4, K5
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
-2: b.eq 3f // AES-192?
- enc_round KS0, v17
- enc_round KS1, v17
- enc_round KS0, v18
- enc_round KS1, v18
-3: enc_round KS0, v19
- enc_round KS1, v19
- enc_round KS0, v20
- enc_round KS1, v20
- b 1b
+ tbnz x7, #2, .Lnot128
+ .subsection 1
+.Lnot128:
+ ld1 {K8.4s-K9.4s}, [x10], #32
+ .irp key, K6, K7
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
+ ld1 {K6.4s-K7.4s}, [x10]
+ .irp key, K8, K9
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
+ tbz x7, #1, .Lout192
+ b .Lout256
+ .previous
-4: load_round_keys w7, x6
- b 0b
- .endm
+.Lout256:
+ .irp key, K6, K7
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
- /*
- * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[],
- * int rounds, u8 ks[])
- */
-ENTRY(pmull_gcm_encrypt)
- pmull_gcm_do_crypt 1
-ENDPROC(pmull_gcm_encrypt)
+.Lout192:
+ enc_qround KS0, KS1, KS2, KS3, KK
- /*
- * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[],
- * int rounds)
- */
-ENTRY(pmull_gcm_decrypt)
- pmull_gcm_do_crypt 0
-ENDPROC(pmull_gcm_decrypt)
+ aese KS0.16b, KL.16b
+ aese KS1.16b, KL.16b
+ aese KS2.16b, KL.16b
+ aese KS3.16b, KL.16b
+
+ eor KS0.16b, KS0.16b, KM.16b
+ eor KS1.16b, KS1.16b, KM.16b
+ eor KS2.16b, KS2.16b, KM.16b
+ eor KS3.16b, KS3.16b, KM.16b
+
+ eor INP0.16b, INP0.16b, KS0.16b
+ eor INP1.16b, INP1.16b, KS1.16b
+ eor INP2.16b, INP2.16b, KS2.16b
+ eor INP3.16b, INP3.16b, KS3.16b
- /*
- * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
- */
-ENTRY(pmull_gcm_encrypt_block)
- cbz x2, 0f
- load_round_keys w3, x2
-0: ld1 {v0.16b}, [x1]
- enc_block v0, w3
- st1 {v0.16b}, [x0]
ret
-ENDPROC(pmull_gcm_encrypt_block)
+ENDPROC(pmull_gcm_enc_4x)
+
+ .section ".rodata", "a"
+ .align 6
+.Lpermute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .previous
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 70b1469783f9..522cf004ce65 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -58,17 +58,15 @@ asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
struct ghash_key const *k,
const char *head);
-asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
+asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
+ struct ghash_key const *k, u64 dg[],
u8 ctr[], u32 const rk[], int rounds,
- u8 ks[]);
+ u8 tag[]);
-asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
- u8 ctr[], u32 const rk[], int rounds);
-
-asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
- u32 const rk[], int rounds);
+asmlinkage void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
+ struct ghash_key const *k, u64 dg[],
+ u8 ctr[], u32 const rk[], int rounds,
+ u8 tag[]);
static int ghash_init(struct shash_desc *desc)
{
@@ -85,7 +83,7 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
struct ghash_key const *k,
const char *head))
{
- if (likely(crypto_simd_usable())) {
+ if (likely(crypto_simd_usable() && simd_update)) {
kernel_neon_begin();
simd_update(blocks, dg, src, key, head);
kernel_neon_end();
@@ -398,136 +396,112 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
}
}
-static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
- u64 dg[], u8 tag[], int cryptlen)
-{
- u8 mac[AES_BLOCK_SIZE];
- u128 lengths;
-
- lengths.a = cpu_to_be64(req->assoclen * 8);
- lengths.b = cpu_to_be64(cryptlen * 8);
-
- ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL,
- pmull_ghash_update_p64);
-
- put_unaligned_be64(dg[1], mac);
- put_unaligned_be64(dg[0], mac + 8);
-
- crypto_xor(tag, mac, AES_BLOCK_SIZE);
-}
-
static int gcm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
+ u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
- u8 ks[2 * AES_BLOCK_SIZE];
- u8 tag[AES_BLOCK_SIZE];
u64 dg[2] = {};
- int nrounds = num_rounds(&ctx->aes_key);
+ u128 lengths;
+ u8 *tag;
int err;
+ lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.b = cpu_to_be64(req->cryptlen * 8);
+
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
- if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) {
- u32 const *rk = NULL;
-
- kernel_neon_begin();
- pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
- pmull_gcm_encrypt_block(ks, iv, NULL, nrounds);
- put_unaligned_be32(3, iv + GCM_IV_SIZE);
- pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds);
- put_unaligned_be32(4, iv + GCM_IV_SIZE);
-
+ if (likely(crypto_simd_usable())) {
do {
- int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;
+
+ tag = (u8 *)&lengths;
- if (rk)
- kernel_neon_begin();
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
+ }
- pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
- walk.src.virt.addr, &ctx->ghash_key,
- iv, rk, nrounds, ks);
+ kernel_neon_begin();
+ pmull_gcm_encrypt(nbytes, dst, src, &ctx->ghash_key, dg,
+ iv, ctx->aes_key.key_enc, nrounds,
+ tag);
kernel_neon_end();
- err = skcipher_walk_done(&walk,
- walk.nbytes % (2 * AES_BLOCK_SIZE));
+ if (unlikely(!nbytes))
+ break;
- rk = ctx->aes_key.key_enc;
- } while (walk.nbytes >= 2 * AES_BLOCK_SIZE);
- } else {
- aes_encrypt(&ctx->aes_key, tag, iv);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
- while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) {
- const int blocks =
- walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);
+ } else {
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
- u8 *src = walk.src.virt.addr;
int remaining = blocks;
do {
- aes_encrypt(&ctx->aes_key, ks, iv);
- crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE);
+ aes_encrypt(&ctx->aes_key, buf, iv);
+ crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--remaining > 0);
- ghash_do_update(blocks, dg,
- walk.dst.virt.addr, &ctx->ghash_key,
- NULL, pmull_ghash_update_p64);
+ ghash_do_update(blocks, dg, walk.dst.virt.addr,
+ &ctx->ghash_key, NULL, NULL);
err = skcipher_walk_done(&walk,
- walk.nbytes % (2 * AES_BLOCK_SIZE));
- }
- if (walk.nbytes) {
- aes_encrypt(&ctx->aes_key, ks, iv);
- if (walk.nbytes > AES_BLOCK_SIZE) {
- crypto_inc(iv, AES_BLOCK_SIZE);
- aes_encrypt(&ctx->aes_key, ks + AES_BLOCK_SIZE, iv);
- }
+ walk.nbytes % AES_BLOCK_SIZE);
}
- }
- /* handle the tail */
- if (walk.nbytes) {
- u8 buf[GHASH_BLOCK_SIZE];
- unsigned int nbytes = walk.nbytes;
- u8 *dst = walk.dst.virt.addr;
- u8 *head = NULL;
+ /* handle the tail */
+ if (walk.nbytes) {
+ aes_encrypt(&ctx->aes_key, buf, iv);
- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks,
- walk.nbytes);
+ crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
+ buf, walk.nbytes);
- if (walk.nbytes > GHASH_BLOCK_SIZE) {
- head = dst;
- dst += GHASH_BLOCK_SIZE;
- nbytes %= GHASH_BLOCK_SIZE;
+ memcpy(buf, walk.dst.virt.addr, walk.nbytes);
+ memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
}
- memcpy(buf, dst, nbytes);
- memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
- ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head,
- pmull_ghash_update_p64);
+ tag = (u8 *)&lengths;
+ ghash_do_update(1, dg, tag, &ctx->ghash_key,
+ walk.nbytes ? buf : NULL, NULL);
- err = skcipher_walk_done(&walk, 0);
+ if (walk.nbytes)
+ err = skcipher_walk_done(&walk, 0);
+
+ put_unaligned_be64(dg[1], tag);
+ put_unaligned_be64(dg[0], tag + 8);
+ put_unaligned_be32(1, iv + GCM_IV_SIZE);
+ aes_encrypt(&ctx->aes_key, iv, iv);
+ crypto_xor(tag, iv, AES_BLOCK_SIZE);
}
if (err)
return err;
- gcm_final(req, ctx, dg, tag, req->cryptlen);
-
/* copy authtag to end of dst */
scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen,
crypto_aead_authsize(aead), 1);
@@ -540,75 +514,65 @@ static int gcm_decrypt(struct aead_request *req)
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
unsigned int authsize = crypto_aead_authsize(aead);
+ int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
- u8 iv[2 * AES_BLOCK_SIZE];
- u8 tag[AES_BLOCK_SIZE];
- u8 buf[2 * GHASH_BLOCK_SIZE];
+ u8 buf[AES_BLOCK_SIZE];
+ u8 iv[AES_BLOCK_SIZE];
u64 dg[2] = {};
- int nrounds = num_rounds(&ctx->aes_key);
+ u128 lengths;
+ u8 *tag;
int err;
+ lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8);
+
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
err = skcipher_walk_aead_decrypt(&walk, req, false);
- if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) {
- u32 const *rk = NULL;
-
- kernel_neon_begin();
- pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
-
+ if (likely(crypto_simd_usable())) {
do {
- int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
- int rem = walk.total - blocks * AES_BLOCK_SIZE;
-
- if (rk)
- kernel_neon_begin();
-
- pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
- walk.src.virt.addr, &ctx->ghash_key,
- iv, rk, nrounds);
-
- /* check if this is the final iteration of the loop */
- if (rem < (2 * AES_BLOCK_SIZE)) {
- u8 *iv2 = iv + AES_BLOCK_SIZE;
-
- if (rem > AES_BLOCK_SIZE) {
- memcpy(iv2, iv, AES_BLOCK_SIZE);
- crypto_inc(iv2, AES_BLOCK_SIZE);
- }
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;
- pmull_gcm_encrypt_block(iv, iv, NULL, nrounds);
+ tag = (u8 *)&lengths;
- if (rem > AES_BLOCK_SIZE)
- pmull_gcm_encrypt_block(iv2, iv2, NULL,
- nrounds);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
}
+ kernel_neon_begin();
+ pmull_gcm_decrypt(nbytes, dst, src, &ctx->ghash_key, dg,
+ iv, ctx->aes_key.key_enc, nrounds,
+ tag);
kernel_neon_end();
- err = skcipher_walk_done(&walk,
- walk.nbytes % (2 * AES_BLOCK_SIZE));
+ if (unlikely(!nbytes))
+ break;
- rk = ctx->aes_key.key_enc;
- } while (walk.nbytes >= 2 * AES_BLOCK_SIZE);
- } else {
- aes_encrypt(&ctx->aes_key, tag, iv);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
- while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) {
- int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);
+ } else {
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
- u8 *src = walk.src.virt.addr;
ghash_do_update(blocks, dg, walk.src.virt.addr,
- &ctx->ghash_key, NULL,
- pmull_ghash_update_p64);
+ &ctx->ghash_key, NULL, NULL);
do {
aes_encrypt(&ctx->aes_key, buf, iv);
@@ -620,49 +584,38 @@ static int gcm_decrypt(struct aead_request *req)
} while (--blocks > 0);
err = skcipher_walk_done(&walk,
- walk.nbytes % (2 * AES_BLOCK_SIZE));
+ walk.nbytes % AES_BLOCK_SIZE);
}
- if (walk.nbytes) {
- if (walk.nbytes > AES_BLOCK_SIZE) {
- u8 *iv2 = iv + AES_BLOCK_SIZE;
-
- memcpy(iv2, iv, AES_BLOCK_SIZE);
- crypto_inc(iv2, AES_BLOCK_SIZE);
- aes_encrypt(&ctx->aes_key, iv2, iv2);
- }
- aes_encrypt(&ctx->aes_key, iv, iv);
+ /* handle the tail */
+ if (walk.nbytes) {
+ memcpy(buf, walk.src.virt.addr, walk.nbytes);
+ memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
}
- }
- /* handle the tail */
- if (walk.nbytes) {
- const u8 *src = walk.src.virt.addr;
- const u8 *head = NULL;
- unsigned int nbytes = walk.nbytes;
+ tag = (u8 *)&lengths;
+ ghash_do_update(1, dg, tag, &ctx->ghash_key,
+ walk.nbytes ? buf : NULL, NULL);
- if (walk.nbytes > GHASH_BLOCK_SIZE) {
- head = src;
- src += GHASH_BLOCK_SIZE;
- nbytes %= GHASH_BLOCK_SIZE;
- }
+ if (walk.nbytes) {
+ aes_encrypt(&ctx->aes_key, buf, iv);
- memcpy(buf, src, nbytes);
- memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
- ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head,
- pmull_ghash_update_p64);
+ crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
+ buf, walk.nbytes);
- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
- walk.nbytes);
+ err = skcipher_walk_done(&walk, 0);
+ }
- err = skcipher_walk_done(&walk, 0);
+ put_unaligned_be64(dg[1], tag);
+ put_unaligned_be64(dg[0], tag + 8);
+ put_unaligned_be32(1, iv + GCM_IV_SIZE);
+ aes_encrypt(&ctx->aes_key, iv, iv);
+ crypto_xor(tag, iv, AES_BLOCK_SIZE);
}
if (err)
return err;
- gcm_final(req, ctx, dg, tag, req->cryptlen - authsize);
-
/* compare calculated auth tag with the stored one */
scatterwalk_map_and_copy(buf, req->src,
req->assoclen + req->cryptlen - authsize,
@@ -675,7 +628,7 @@ static int gcm_decrypt(struct aead_request *req)
static struct aead_alg gcm_aes_alg = {
.ivsize = GCM_IV_SIZE,
- .chunksize = 2 * AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
.maxauthsize = AES_BLOCK_SIZE,
.setkey = gcm_setkey,
.setauthsize = gcm_setauthsize,
diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl
new file mode 100644
index 000000000000..6e5576d19af8
--- /dev/null
+++ b/arch/arm64/crypto/poly1305-armv8.pl
@@ -0,0 +1,913 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.69/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
+#
+# (*) estimate based on resources availability is less than 1.0,
+# i.e. measured result is worse than expected, presumably binary
+# translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ mov w#$s1,#-1
+ stp $r0,$r1,[$ctx,#32] // save key value
+ str w#$s1,[$ctx,#48] // impossible key power value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr $d0,.Lpoly1305_blocks
+ adr $r0,.Lpoly1305_blocks_neon
+ adr $d1,.Lpoly1305_emit
+
+ csel $d0,$d0,$r0,eq
+
+# ifdef __ILP32__
+ stp w#$d0,w#$d1,[$len]
+# else
+ stp $d0,$d1,[$len]
+# endif
+#endif
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp x17,#0 // is_base2_26?
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __AARCH64EB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp $r0,#0 // is_base2_26?
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __AARCH64EB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 4
+poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.lo .Lpoly1305_blocks
+
+ .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ b .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+.Linit_neon:
+ ldr w17,[$ctx,#48] // first table element
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cmp w17,#-1 // is value impossible?
+ b.ne .Leven_neon
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$inp,#32] // inp[2:3]
+ subs $len,$len,#64
+ ldp x9,x13,[$inp,#48]
+ add $in2,$inp,#96
+ adr $zeros,.Lzeros
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr $T0.2d,$ACC3,#26
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ bic $H4,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ add $IN01_2,$IN01_2,$H2
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+ ldr x30,[sp,#8]
+ .inst 0xd50323bf // autiasp
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ mov x4,#1
+ st1 {$ACC4}[0],[$ctx]
+ str x4,[$ctx,#8] // set is_base2_26
+
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+ s/w#x([0-9]+)/w$1/g;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/arch/arm64/crypto/poly1305-core.S_shipped b/arch/arm64/crypto/poly1305-core.S_shipped
new file mode 100644
index 000000000000..8d1c4e420ccd
--- /dev/null
+++ b/arch/arm64/crypto/poly1305-core.S_shipped
@@ -0,0 +1,835 @@
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp x1,xzr
+ stp xzr,xzr,[x0] // zero hash value
+ stp xzr,xzr,[x0,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+ ldp x7,x8,[x1] // load key
+ mov x9,#0xfffffffc0fffffff
+ movk x9,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev x7,x7 // flip bytes
+ rev x8,x8
+#endif
+ and x7,x7,x9 // &=0ffffffc0fffffff
+ and x9,x9,#-4
+ and x8,x8,x9 // &=0ffffffc0ffffffc
+ mov w9,#-1
+ stp x7,x8,[x0,#32] // save key value
+ str w9,[x0,#48] // impossible key power value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr x12,.Lpoly1305_blocks
+ adr x7,.Lpoly1305_blocks_neon
+ adr x13,.Lpoly1305_emit
+
+ csel x12,x12,x7,eq
+
+# ifdef __ILP32__
+ stp w12,w13,[x2]
+# else
+ stp x12,x13,[x2]
+# endif
+#endif
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ ands x2,x2,#-16
+ b.eq .Lno_data
+
+ ldp x4,x5,[x0] // load hash value
+ ldp x6,x17,[x0,#16] // [along with is_base2_26]
+ ldp x7,x8,[x0,#32] // load key value
+
+#ifdef __AARCH64EB__
+ lsr x12,x4,#32
+ mov w13,w4
+ lsr x14,x5,#32
+ mov w15,w5
+ lsr x16,x6,#32
+#else
+ mov w12,w4
+ lsr x13,x4,#32
+ mov w14,w5
+ lsr x15,x5,#32
+ mov w16,w6
+#endif
+
+ add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
+ lsr x13,x14,#12
+ adds x12,x12,x14,lsl#52
+ add x13,x13,x15,lsl#14
+ adc x13,x13,xzr
+ lsr x14,x16,#24
+ adds x13,x13,x16,lsl#40
+ adc x14,x14,xzr
+
+ cmp x17,#0 // is_base2_26?
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ csel x4,x4,x12,eq // choose between radixes
+ csel x5,x5,x13,eq
+ csel x6,x6,x14,eq
+
+.Loop:
+ ldp x10,x11,[x1],#16 // load input
+ sub x2,x2,#16
+#ifdef __AARCH64EB__
+ rev x10,x10
+ rev x11,x11
+#endif
+ adds x4,x4,x10 // accumulate input
+ adcs x5,x5,x11
+
+ mul x12,x4,x7 // h0*r0
+ adc x6,x6,x3
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ cbnz x2,.Loop
+
+ stp x4,x5,[x0] // store hash value
+ stp x6,xzr,[x0,#16] // [and clear is_base2_26]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ ldp x4,x5,[x0] // load hash base 2^64
+ ldp x6,x7,[x0,#16] // [along with is_base2_26]
+ ldp x10,x11,[x2] // load nonce
+
+#ifdef __AARCH64EB__
+ lsr x12,x4,#32
+ mov w13,w4
+ lsr x14,x5,#32
+ mov w15,w5
+ lsr x16,x6,#32
+#else
+ mov w12,w4
+ lsr x13,x4,#32
+ mov w14,w5
+ lsr x15,x5,#32
+ mov w16,w6
+#endif
+
+ add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
+ lsr x13,x14,#12
+ adds x12,x12,x14,lsl#52
+ add x13,x13,x15,lsl#14
+ adc x13,x13,xzr
+ lsr x14,x16,#24
+ adds x13,x13,x16,lsl#40
+ adc x14,x14,xzr
+
+ cmp x7,#0 // is_base2_26?
+ csel x4,x4,x12,eq // choose between radixes
+ csel x5,x5,x13,eq
+ csel x6,x6,x14,eq
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __AARCH64EB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __AARCH64EB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul x12,x4,x7 // h0*r0
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 4
+poly1305_splat:
+ and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,x4,#26,#26
+ extr x14,x5,x4,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,x5,#14,#26
+ extr x16,x6,x5,#40
+
+ str w12,[x0,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[x0,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[x0,#16*2] // s1
+ str w14,[x0,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[x0,#16*4] // s2
+ str w15,[x0,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[x0,#16*6] // s3
+ str w16,[x0,#16*7] // r4
+ str w15,[x0,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr x17,[x0,#24]
+ cmp x2,#128
+ b.lo .Lpoly1305_blocks
+
+ .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ cbz x17,.Lbase2_64_neon
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ tst x2,#31
+ b.eq .Leven_neon
+
+ ldp x7,x8,[x0,#32] // load key value
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x14,x6,xzr // can be partially reduced...
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+
+#ifdef __AARCH64EB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl poly1305_mult
+
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ b .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp x7,x8,[x0,#32] // load key value
+
+ ldp x4,x5,[x0] // load hash value base 2^64
+ ldr x6,[x0,#16]
+
+ tst x2,#31
+ b.eq .Linit_neon
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl poly1305_mult
+
+.Linit_neon:
+ ldr w17,[x0,#48] // first table element
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ cmp w17,#-1 // is value impossible?
+ b.ne .Leven_neon
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+ ////////////////////////////////// initialize r^n table
+ mov x4,x7 // r^1
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov x5,x8
+ mov x6,xzr
+ add x0,x0,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub x0,x0,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub x0,x0,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub x0,x0,#4
+ bl poly1305_splat
+ sub x0,x0,#48 // restore original x0
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+.Ldo_neon:
+ ldp x8,x12,[x1,#32] // inp[2:3]
+ subs x2,x2,#64
+ ldp x9,x13,[x1,#48]
+ add x16,x1,#96
+ adr x17,.Lzeros
+
+ lsl x3,x3,#24
+ add x15,x0,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d14,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d15,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov d16,x8
+ fmov d17,x10
+ fmov d18,x12
+
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ ldp x9,x13,[x1],#48
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
+ ld1 {v8.4s},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d9,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d10,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi v31.2d,#-1
+ fmov d11,x8
+ fmov d12,x10
+ fmov d13,x12
+ ushr v31.2d,v31.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // ___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // ___________________/ ____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs x2,x2,#64
+ umull v23.2d,v14.2s,v7.s[2]
+ csel x16,x17,x16,lo
+ umull v22.2d,v14.2s,v5.s[2]
+ umull v21.2d,v14.2s,v3.s[2]
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ umull v20.2d,v14.2s,v1.s[2]
+ ldp x9,x13,[x16],#48
+ umull v19.2d,v14.2s,v0.s[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal v23.2d,v15.2s,v5.s[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v22.2d,v15.2s,v3.s[2]
+ and x5,x9,#0x03ffffff
+ umlal v21.2d,v15.2s,v1.s[2]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v15.2s,v0.s[2]
+ ubfx x7,x9,#26,#26
+ umlal v19.2d,v15.2s,v8.s[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal v23.2d,v16.2s,v3.s[2]
+ extr x8,x12,x8,#52
+ umlal v22.2d,v16.2s,v1.s[2]
+ extr x9,x13,x9,#52
+ umlal v21.2d,v16.2s,v0.s[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v20.2d,v16.2s,v8.s[2]
+ fmov d14,x4
+ umlal v19.2d,v16.2s,v6.s[2]
+ and x8,x8,#0x03ffffff
+
+ umlal v23.2d,v17.2s,v1.s[2]
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v17.2s,v0.s[2]
+ ubfx x10,x12,#14,#26
+ umlal v21.2d,v17.2s,v8.s[2]
+ ubfx x11,x13,#14,#26
+ umlal v20.2d,v17.2s,v6.s[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v19.2d,v17.2s,v4.s[2]
+ fmov d15,x6
+
+ add v11.2s,v11.2s,v26.2s
+ add x12,x3,x12,lsr#40
+ umlal v23.2d,v18.2s,v0.s[2]
+ add x13,x3,x13,lsr#40
+ umlal v22.2d,v18.2s,v8.s[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v21.2d,v18.2s,v6.s[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v18.2s,v4.s[2]
+ fmov d16,x8
+ umlal v19.2d,v18.2s,v2.s[2]
+ fmov d17,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ fmov d18,x12
+ umlal v22.2d,v11.2s,v1.s[0]
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ umlal v19.2d,v11.2s,v6.s[0]
+ ldp x9,x13,[x1],#48
+ umlal v23.2d,v11.2s,v3.s[0]
+ umlal v20.2d,v11.2s,v8.s[0]
+ umlal v21.2d,v11.2s,v0.s[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.s[0]
+ umlal v23.2d,v9.2s,v7.s[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v21.2d,v9.2s,v3.s[0]
+ and x5,x9,#0x03ffffff
+ umlal v19.2d,v9.2s,v0.s[0]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v9.2s,v1.s[0]
+ ubfx x7,x9,#26,#26
+
+ add v12.2s,v12.2s,v27.2s
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal v22.2d,v10.2s,v3.s[0]
+ extr x8,x12,x8,#52
+ umlal v23.2d,v10.2s,v5.s[0]
+ extr x9,x13,x9,#52
+ umlal v19.2d,v10.2s,v8.s[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v21.2d,v10.2s,v1.s[0]
+ fmov d9,x4
+ umlal v20.2d,v10.2s,v0.s[0]
+ and x8,x8,#0x03ffffff
+
+ add v13.2s,v13.2s,v28.2s
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v12.2s,v0.s[0]
+ ubfx x10,x12,#14,#26
+ umlal v19.2d,v12.2s,v4.s[0]
+ ubfx x11,x13,#14,#26
+ umlal v23.2d,v12.2s,v1.s[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v20.2d,v12.2s,v6.s[0]
+ fmov d10,x6
+ umlal v21.2d,v12.2s,v8.s[0]
+ add x12,x3,x12,lsr#40
+
+ umlal v22.2d,v13.2s,v8.s[0]
+ add x13,x3,x13,lsr#40
+ umlal v19.2d,v13.2s,v2.s[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v23.2d,v13.2s,v0.s[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v13.2s,v4.s[0]
+ fmov d11,x8
+ umlal v21.2d,v13.2s,v6.s[0]
+ fmov d12,x10
+ fmov d13,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr v29.2d,v22.2d,#26
+ xtn v27.2s,v22.2d
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ xtn v28.2s,v23.2d
+ ushr v30.2d,v20.2d,#26
+ xtn v25.2s,v20.2d
+ bic v28.2s,#0xfc,lsl#24
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ shrn v30.2s,v21.2d,#26
+ xtn v26.2s,v21.2d
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ bic v25.2s,#0xfc,lsl#24
+ add v27.2s,v27.2s,v30.2s // h2 -> h3
+ bic v26.2s,#0xfc,lsl#24
+
+ shrn v29.2s,v19.2d,#26
+ xtn v24.2s,v19.2d
+ ushr v30.2s,v27.2s,#26
+ bic v27.2s,#0xfc,lsl#24
+ bic v24.2s,#0xfc,lsl#24
+ add v25.2s,v25.2s,v29.2s // h0 -> h1
+ add v28.2s,v28.2s,v30.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup v16.2d,v16.d[0]
+ add v11.2s,v11.2s,v26.2s
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds x2,x2,#32
+ b.ne .Long_tail
+
+ dup v16.2d,v11.d[0]
+ add v14.2s,v9.2s,v24.2s
+ add v17.2s,v12.2s,v27.2s
+ add v15.2s,v10.2s,v25.2s
+ add v18.2s,v13.2s,v28.2s
+
+.Long_tail:
+ dup v14.2d,v14.d[0]
+ umull2 v19.2d,v16.4s,v6.4s
+ umull2 v22.2d,v16.4s,v1.4s
+ umull2 v23.2d,v16.4s,v3.4s
+ umull2 v21.2d,v16.4s,v0.4s
+ umull2 v20.2d,v16.4s,v8.4s
+
+ dup v15.2d,v15.d[0]
+ umlal2 v19.2d,v14.4s,v0.4s
+ umlal2 v21.2d,v14.4s,v3.4s
+ umlal2 v22.2d,v14.4s,v5.4s
+ umlal2 v23.2d,v14.4s,v7.4s
+ umlal2 v20.2d,v14.4s,v1.4s
+
+ dup v17.2d,v17.d[0]
+ umlal2 v19.2d,v15.4s,v8.4s
+ umlal2 v22.2d,v15.4s,v3.4s
+ umlal2 v21.2d,v15.4s,v1.4s
+ umlal2 v23.2d,v15.4s,v5.4s
+ umlal2 v20.2d,v15.4s,v0.4s
+
+ dup v18.2d,v18.d[0]
+ umlal2 v22.2d,v17.4s,v0.4s
+ umlal2 v23.2d,v17.4s,v1.4s
+ umlal2 v19.2d,v17.4s,v4.4s
+ umlal2 v20.2d,v17.4s,v6.4s
+ umlal2 v21.2d,v17.4s,v8.4s
+
+ umlal2 v22.2d,v18.4s,v8.4s
+ umlal2 v19.2d,v18.4s,v2.4s
+ umlal2 v23.2d,v18.4s,v0.4s
+ umlal2 v20.2d,v18.4s,v4.4s
+ umlal2 v21.2d,v18.4s,v6.4s
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ umlal v22.2d,v11.2s,v1.2s
+ umlal v19.2d,v11.2s,v6.2s
+ umlal v23.2d,v11.2s,v3.2s
+ umlal v20.2d,v11.2s,v8.2s
+ umlal v21.2d,v11.2s,v0.2s
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.2s
+ umlal v19.2d,v9.2s,v0.2s
+ umlal v23.2d,v9.2s,v7.2s
+ umlal v20.2d,v9.2s,v1.2s
+ umlal v21.2d,v9.2s,v3.2s
+
+ add v12.2s,v12.2s,v27.2s
+ umlal v22.2d,v10.2s,v3.2s
+ umlal v19.2d,v10.2s,v8.2s
+ umlal v23.2d,v10.2s,v5.2s
+ umlal v20.2d,v10.2s,v0.2s
+ umlal v21.2d,v10.2s,v1.2s
+
+ add v13.2s,v13.2s,v28.2s
+ umlal v22.2d,v12.2s,v0.2s
+ umlal v19.2d,v12.2s,v4.2s
+ umlal v23.2d,v12.2s,v1.2s
+ umlal v20.2d,v12.2s,v6.2s
+ umlal v21.2d,v12.2s,v8.2s
+
+ umlal v22.2d,v13.2s,v8.2s
+ umlal v19.2d,v13.2s,v2.2s
+ umlal v23.2d,v13.2s,v0.2s
+ umlal v20.2d,v13.2s,v4.2s
+ umlal v21.2d,v13.2s,v6.2s
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp v22.2d,v22.2d,v22.2d
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp v19.2d,v19.2d,v19.2d
+ ldp d10,d11,[sp,#32]
+ addp v23.2d,v23.2d,v23.2d
+ ldp d12,d13,[sp,#48]
+ addp v20.2d,v20.2d,v20.2d
+ ldp d14,d15,[sp,#64]
+ addp v21.2d,v21.2d,v21.2d
+ ldr x30,[sp,#8]
+ .inst 0xd50323bf // autiasp
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr v29.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ and v23.16b,v23.16b,v31.16b
+ ushr v30.2d,v20.2d,#26
+ and v20.16b,v20.16b,v31.16b
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ ushr v30.2d,v21.2d,#26
+ and v21.16b,v21.16b,v31.16b
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ add v22.2d,v22.2d,v30.2d // h2 -> h3
+
+ ushr v29.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ ushr v30.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ add v20.2d,v20.2d,v29.2d // h0 -> h1
+ add v23.2d,v23.2d,v30.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
+ mov x4,#1
+ st1 {v23.s}[0],[x0]
+ str x4,[x0,#8] // set is_base2_26
+
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c
new file mode 100644
index 000000000000..dd843d0ee83a
--- /dev/null
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/jump_label.h>
+#include <linux/module.h>
+
+asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
+asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+{
+ poly1305_init_arm64(&dctx->h, key);
+ dctx->s[0] = get_unaligned_le32(key + 16);
+ dctx->s[1] = get_unaligned_le32(key + 20);
+ dctx->s[2] = get_unaligned_le32(key + 24);
+ dctx->s[3] = get_unaligned_le32(key + 28);
+ dctx->buflen = 0;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+static int neon_poly1305_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ dctx->buflen = 0;
+ dctx->rset = 0;
+ dctx->sset = false;
+
+ return 0;
+}
+
+static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+ u32 len, u32 hibit, bool do_neon)
+{
+ if (unlikely(!dctx->sset)) {
+ if (!dctx->rset) {
+ poly1305_init_arch(dctx, src);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
+ }
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(src + 0);
+ dctx->s[1] = get_unaligned_le32(src + 4);
+ dctx->s[2] = get_unaligned_le32(src + 8);
+ dctx->s[3] = get_unaligned_le32(src + 12);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ if (len < POLY1305_BLOCK_SIZE)
+ return;
+ }
+
+ len &= ~(POLY1305_BLOCK_SIZE - 1);
+
+ if (static_branch_likely(&have_neon) && likely(do_neon))
+ poly1305_blocks_neon(&dctx->h, src, len, hibit);
+ else
+ poly1305_blocks(&dctx->h, src, len, hibit);
+}
+
+static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
+ const u8 *src, u32 len, bool do_neon)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ len -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ neon_poly1305_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1, false);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(len >= POLY1305_BLOCK_SIZE)) {
+ neon_poly1305_blocks(dctx, src, len, 1, do_neon);
+ src += round_down(len, POLY1305_BLOCK_SIZE);
+ len %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(len)) {
+ dctx->buflen = len;
+ memcpy(dctx->buf, src, len);
+ }
+}
+
+static int neon_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ bool do_neon = crypto_simd_usable() && srclen > 128;
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_begin();
+ neon_poly1305_do_update(dctx, src, srclen, do_neon);
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_end();
+ return 0;
+}
+
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int nbytes)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ nbytes -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+
+ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, len, 1);
+ kernel_neon_end();
+ } else {
+ poly1305_blocks(&dctx->h, src, len, 1);
+ }
+ src += len;
+ nbytes %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(nbytes)) {
+ dctx->buflen = nbytes;
+ memcpy(dctx->buf, src, nbytes);
+ }
+}
+EXPORT_SYMBOL(poly1305_update_arch);
+
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+ __le32 digest[4];
+ u64 f = 0;
+
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_emit(&dctx->h, digest, dctx->s);
+
+ /* mac = (h + s) % (2^128) */
+ f = (f >> 32) + le32_to_cpu(digest[0]);
+ put_unaligned_le32(f, dst);
+ f = (f >> 32) + le32_to_cpu(digest[1]);
+ put_unaligned_le32(f, dst + 4);
+ f = (f >> 32) + le32_to_cpu(digest[2]);
+ put_unaligned_le32(f, dst + 8);
+ f = (f >> 32) + le32_to_cpu(digest[3]);
+ put_unaligned_le32(f, dst + 12);
+
+ *dctx = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL(poly1305_final_arch);
+
+static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ poly1305_final_arch(dctx, dst);
+ return 0;
+}
+
+static struct shash_alg neon_poly1305_alg = {
+ .init = neon_poly1305_init,
+ .update = neon_poly1305_update,
+ .final = neon_poly1305_final,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+
+ .base.cra_name = "poly1305",
+ .base.cra_driver_name = "poly1305-neon",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+};
+
+static int __init neon_poly1305_mod_init(void)
+{
+ if (!cpu_have_named_feature(ASIMD))
+ return 0;
+
+ static_branch_enable(&have_neon);
+
+ return crypto_register_shash(&neon_poly1305_alg);
+}
+
+static void __exit neon_poly1305_mod_exit(void)
+{
+ if (cpu_have_named_feature(ASIMD))
+ crypto_unregister_shash(&neon_poly1305_alg);
+}
+
+module_init(neon_poly1305_mod_init);
+module_exit(neon_poly1305_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-neon");
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 5bf963830b17..f68a0e64482a 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -58,29 +58,4 @@ alternative_else_nop_endif
.endm
#endif
-/*
- * These macros are no-ops when UAO is present.
- */
- .macro uaccess_disable_not_uao, tmp1, tmp2
- uaccess_ttbr0_disable \tmp1, \tmp2
-alternative_if ARM64_ALT_PAN_NOT_UAO
- SET_PSTATE_PAN(1)
-alternative_else_nop_endif
- .endm
-
- .macro uaccess_enable_not_uao, tmp1, tmp2, tmp3
- uaccess_ttbr0_enable \tmp1, \tmp2, \tmp3
-alternative_if ARM64_ALT_PAN_NOT_UAO
- SET_PSTATE_PAN(0)
-alternative_else_nop_endif
- .endm
-
-/*
- * Remove the address tag from a virtual address, if present.
- */
- .macro untagged_addr, dst, addr
- sbfx \dst, \addr, #0, #56
- and \dst, \dst, \addr
- .endm
-
#endif
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index e0e2b1946f42..7d9cc5ec4971 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -29,6 +29,18 @@
SB_BARRIER_INSN"nop\n", \
ARM64_HAS_SB))
+#ifdef CONFIG_ARM64_PSEUDO_NMI
+#define pmr_sync() \
+ do { \
+ extern struct static_key_false gic_pmr_sync; \
+ \
+ if (static_branch_unlikely(&gic_pmr_sync)) \
+ dsb(sy); \
+ } while(0)
+#else
+#define pmr_sync() do {} while (0)
+#endif
+
#define mb() dsb(sy)
#define rmb() dsb(ld)
#define wmb() dsb(st)
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 43da6dd29592..806e9dc2a852 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -11,6 +11,7 @@
#define CTR_L1IP_MASK 3
#define CTR_DMINLINE_SHIFT 16
#define CTR_IMINLINE_SHIFT 0
+#define CTR_IMINLINE_MASK 0xf
#define CTR_ERG_SHIFT 20
#define CTR_CWG_SHIFT 24
#define CTR_CWG_MASK 15
@@ -18,7 +19,7 @@
#define CTR_DIC_SHIFT 29
#define CTR_CACHE_MINLINE_MASK \
- (0xf << CTR_DMINLINE_SHIFT | 0xf << CTR_IMINLINE_SHIFT)
+ (0xf << CTR_DMINLINE_SHIFT | CTR_IMINLINE_MASK << CTR_IMINLINE_SHIFT)
#define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index ac1dbca3d0cd..b92683871119 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -54,7 +54,9 @@
#define ARM64_WORKAROUND_1463225 44
#define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM 45
#define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM 46
+#define ARM64_WORKAROUND_1542419 47
+#define ARM64_WORKAROUND_1319367 48
-#define ARM64_NCAPS 47
+#define ARM64_NCAPS 49
#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 9cde5d2e768f..4261d55e8506 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -659,6 +659,20 @@ static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
default: return CONFIG_ARM64_PA_BITS;
}
}
+
+/* Check whether hardware update of the Access flag is supported */
+static inline bool cpu_has_hw_af(void)
+{
+ u64 mmfr1;
+
+ if (!IS_ENABLED(CONFIG_ARM64_HW_AFDBM))
+ return false;
+
+ mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
+ return cpuid_feature_extract_unsigned_field(mmfr1,
+ ID_AA64MMFR1_HADBS_SHIFT);
+}
+
#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h
index 063c964af705..72acd2db167f 100644
--- a/arch/arm64/include/asm/daifflags.h
+++ b/arch/arm64/include/asm/daifflags.h
@@ -8,7 +8,9 @@
#include <linux/irqflags.h>
#include <asm/arch_gicv3.h>
+#include <asm/barrier.h>
#include <asm/cpufeature.h>
+#include <asm/ptrace.h>
#define DAIF_PROCCTX 0
#define DAIF_PROCCTX_NOIRQ PSR_I_BIT
@@ -65,7 +67,7 @@ static inline void local_daif_restore(unsigned long flags)
if (system_uses_irq_prio_masking()) {
gic_write_pmr(GIC_PRIO_IRQON);
- dsb(sy);
+ pmr_sync();
}
} else if (system_uses_irq_prio_masking()) {
u64 pmr;
@@ -109,4 +111,19 @@ static inline void local_daif_restore(unsigned long flags)
trace_hardirqs_off();
}
+/*
+ * Called by synchronous exception handlers to restore the DAIF bits that were
+ * modified by taking an exception.
+ */
+static inline void local_daif_inherit(struct pt_regs *regs)
+{
+ unsigned long flags = regs->pstate & DAIF_MASK;
+
+ /*
+ * We can't use local_daif_restore(regs->pstate) here as
+ * system_has_prio_mask_debugging() won't restore the I bit if it can
+ * use the pmr instead.
+ */
+ write_sysreg(flags, daif);
+}
#endif
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index a17393ff6677..4d5f3b5f50cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -8,14 +8,15 @@
#define __ASM_EXCEPTION_H
#include <asm/esr.h>
+#include <asm/kprobes.h>
+#include <asm/ptrace.h>
#include <linux/interrupt.h>
-#define __exception __attribute__((section(".exception.text")))
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#define __exception_irq_entry __irq_entry
#else
-#define __exception_irq_entry __exception
+#define __exception_irq_entry __kprobes
#endif
static inline u32 disr_to_esr(u64 disr)
@@ -31,5 +32,22 @@ static inline u32 disr_to_esr(u64 disr)
}
asmlinkage void enter_from_user_mode(void);
+void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_undefinstr(struct pt_regs *regs);
+asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
+void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
+ struct pt_regs *regs);
+void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs);
+void do_sve_acc(unsigned int esr, struct pt_regs *regs);
+void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs);
+void do_sysinstr(unsigned int esr, struct pt_regs *regs);
+void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr);
+void do_cp15instr(unsigned int esr, struct pt_regs *regs);
+void el0_svc_handler(struct pt_regs *regs);
+void el0_svc_compat_handler(struct pt_regs *regs);
+void do_el0_ia_bp_hardening(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs);
#endif /* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index d48667b04c41..91fa4baa1a93 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -11,9 +11,20 @@
#include <asm/insn.h>
#define HAVE_FUNCTION_GRAPH_FP_TEST
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#define ARCH_SUPPORTS_FTRACE_OPS 1
+#else
#define MCOUNT_ADDR ((unsigned long)_mcount)
+#endif
+
+/* The BL at the callsite's adjusted rec->ip */
#define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE
+#define FTRACE_PLT_IDX 0
+#define FTRACE_REGS_PLT_IDX 1
+#define NR_FTRACE_PLTS 2
+
/*
* Currently, gcc tends to save the link register after the local variables
* on the stack. This causes the max stack tracer to report the function
@@ -44,12 +55,24 @@ extern void return_to_handler(void);
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
/*
+ * Adjust addr to point at the BL in the callsite.
+ * See ftrace_init_nop() for the callsite sequence.
+ */
+ if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
+ return addr + AARCH64_INSN_SIZE;
+ /*
* addr is the address of the mcount call instruction.
* recordmcount does the necessary offset calculation.
*/
return addr;
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+struct dyn_ftrace;
+int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
+#define ftrace_init_nop ftrace_init_nop
+#endif
+
#define ftrace_return_address(n) return_address(n)
/*
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 39e7780bedd6..bb313dde58a4 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -440,6 +440,9 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
int shift,
enum aarch64_insn_variant variant,
enum aarch64_insn_logic_type type);
+u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_variant variant);
u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
enum aarch64_insn_variant variant,
enum aarch64_insn_register Rn,
diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
index 1a59f0ed1ae3..aa4b6521ef14 100644
--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -6,6 +6,7 @@
#define __ASM_IRQFLAGS_H
#include <asm/alternative.h>
+#include <asm/barrier.h>
#include <asm/ptrace.h>
#include <asm/sysreg.h>
@@ -34,14 +35,14 @@ static inline void arch_local_irq_enable(void)
}
asm volatile(ALTERNATIVE(
- "msr daifclr, #2 // arch_local_irq_enable\n"
- "nop",
- __msr_s(SYS_ICC_PMR_EL1, "%0")
- "dsb sy",
+ "msr daifclr, #2 // arch_local_irq_enable",
+ __msr_s(SYS_ICC_PMR_EL1, "%0"),
ARM64_HAS_IRQ_PRIO_MASKING)
:
: "r" ((unsigned long) GIC_PRIO_IRQON)
: "memory");
+
+ pmr_sync();
}
static inline void arch_local_irq_disable(void)
@@ -116,14 +117,14 @@ static inline unsigned long arch_local_irq_save(void)
static inline void arch_local_irq_restore(unsigned long flags)
{
asm volatile(ALTERNATIVE(
- "msr daif, %0\n"
- "nop",
- __msr_s(SYS_ICC_PMR_EL1, "%0")
- "dsb sy",
- ARM64_HAS_IRQ_PRIO_MASKING)
+ "msr daif, %0",
+ __msr_s(SYS_ICC_PMR_EL1, "%0"),
+ ARM64_HAS_IRQ_PRIO_MASKING)
:
: "r" (flags)
: "memory");
+
+ pmr_sync();
}
#endif /* __ASM_IRQFLAGS_H */
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index ddf9d762ac62..6e5d839f42b5 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -61,7 +61,6 @@
* RW: 64bit by default, can be overridden for 32bit VMs
* TAC: Trap ACTLR
* TSC: Trap SMC
- * TVM: Trap VM ops (until M+C set in SCTLR_EL1)
* TSW: Trap cache operations by set/way
* TWE: Trap WFE
* TWI: Trap WFI
@@ -74,7 +73,7 @@
* SWIO: Turn set/way invalidates into set/way clean+invalidate
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
- HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
+ HCR_BSU_IS | HCR_FB | HCR_TAC | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
HCR_FMO | HCR_IMO)
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index d69c1efc63e7..5efe5ca8fecf 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -53,8 +53,18 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
/* trap error record accesses */
vcpu->arch.hcr_el2 |= HCR_TERR;
}
- if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
vcpu->arch.hcr_el2 |= HCR_FWB;
+ } else {
+ /*
+ * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
+ * get set in SCTLR_EL1 such that we can detect when the guest
+ * MMU gets turned on and do the necessary cache maintenance
+ * then.
+ */
+ vcpu->arch.hcr_el2 |= HCR_TVM;
+ }
if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
vcpu->arch.hcr_el2 &= ~HCR_RW;
@@ -77,14 +87,19 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
return (unsigned long *)&vcpu->arch.hcr_el2;
}
-static inline void vcpu_clear_wfe_traps(struct kvm_vcpu *vcpu)
+static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr_el2 &= ~HCR_TWE;
+ if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count))
+ vcpu->arch.hcr_el2 &= ~HCR_TWI;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TWI;
}
-static inline void vcpu_set_wfe_traps(struct kvm_vcpu *vcpu)
+static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr_el2 |= HCR_TWE;
+ vcpu->arch.hcr_el2 |= HCR_TWI;
}
static inline void vcpu_ptrauth_enable(struct kvm_vcpu *vcpu)
@@ -258,6 +273,11 @@ static inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu)
return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_ISV);
}
+static inline unsigned long kvm_vcpu_dabt_iss_nisv_sanitized(const struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_get_hsr(vcpu) & (ESR_ELx_CM | ESR_ELx_WNR | ESR_ELx_FSC);
+}
+
static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu)
{
return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f656169db8c3..c61260cf63c5 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -44,6 +44,7 @@
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1)
#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
+#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
@@ -83,6 +84,14 @@ struct kvm_arch {
/* Mandated version of PSCI */
u32 psci_version;
+
+ /*
+ * If we encounter a data abort without valid instruction syndrome
+ * information, report this to user space. User space can (and
+ * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
+ * supported.
+ */
+ bool return_nisv_io_abort_to_user;
};
#define KVM_NR_MEM_OBJS 40
@@ -338,6 +347,13 @@ struct kvm_vcpu_arch {
/* True when deferrable sysregs are loaded on the physical CPU,
* see kvm_vcpu_load_sysregs and kvm_vcpu_put_sysregs. */
bool sysregs_loaded_on_cpu;
+
+ /* Guest PV state */
+ struct {
+ u64 steal;
+ u64 last_steal;
+ gpa_t base;
+ } steal;
};
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@@ -478,6 +494,27 @@ void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
int kvm_perf_init(void);
int kvm_perf_teardown(void);
+long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
+gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu);
+void kvm_update_stolen_time(struct kvm_vcpu *vcpu);
+
+int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr);
+int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr);
+int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr);
+
+static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
+{
+ vcpu_arch->steal.base = GPA_INVALID;
+}
+
+static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
+{
+ return (vcpu_arch->steal.base != GPA_INVALID);
+}
+
void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
@@ -600,8 +637,7 @@ static inline void kvm_arm_vhe_guest_enter(void)
* local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a
* dsb to ensure the redistributor is forwards EL2 IRQs to the CPU.
*/
- if (system_uses_irq_prio_masking())
- dsb(sy);
+ pmr_sync();
}
static inline void kvm_arm_vhe_guest_exit(void)
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index c23c47360664..a4f9ca5479b0 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -69,12 +69,6 @@
#define KERNEL_START _text
#define KERNEL_END _end
-#ifdef CONFIG_ARM64_VA_BITS_52
-#define MAX_USER_VA_BITS 52
-#else
-#define MAX_USER_VA_BITS VA_BITS
-#endif
-
/*
* Generic and tag-based KASAN require 1/8th and 1/16th of the kernel virtual
* address space for the shadow region respectively. They can bloat the stack
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index f80e13cbf8ec..1e93de68c044 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -21,7 +21,7 @@ struct mod_arch_specific {
struct mod_plt_sec init;
/* for CONFIG_DYNAMIC_FTRACE */
- struct plt_entry *ftrace_trampoline;
+ struct plt_entry *ftrace_trampolines;
};
#endif
diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
index 799d9dd6f7cc..cf3a0fd7c1a7 100644
--- a/arch/arm64/include/asm/paravirt.h
+++ b/arch/arm64/include/asm/paravirt.h
@@ -21,6 +21,13 @@ static inline u64 paravirt_steal_clock(int cpu)
{
return pv_ops.time.steal_clock(cpu);
}
-#endif
+
+int __init pv_time_init(void);
+
+#else
+
+#define pv_time_init() do {} while (0)
+
+#endif // CONFIG_PARAVIRT
#endif
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 3df60f97da1f..d9fbd433cc17 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -69,7 +69,7 @@
#define PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - CONFIG_PGTABLE_LEVELS)
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD (1 << (MAX_USER_VA_BITS - PGDIR_SHIFT))
+#define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT))
/*
* Section address mask and size definitions.
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 565aa45ef134..5d15b4735a0e 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -17,7 +17,7 @@
* VMALLOC range.
*
* VMALLOC_START: beginning of the kernel vmalloc space
- * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space
+ * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space
* and fixed mappings
*/
#define VMALLOC_START (MODULES_END)
@@ -865,6 +865,20 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
#define phys_to_ttbr(addr) (addr)
#endif
+/*
+ * On arm64 without hardware Access Flag, copying from user will fail because
+ * the pte is old and cannot be marked young. So we always end up with zeroed
+ * page after fork() + CoW for pfn mappings. We don't always have a
+ * hardware-managed access flag on arm64.
+ */
+static inline bool arch_faults_on_old_pte(void)
+{
+ WARN_ON(preemptible());
+
+ return !cpu_has_hw_af();
+}
+#define arch_faults_on_old_pte arch_faults_on_old_pte
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 5623685c7d13..5ba63204d078 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -9,7 +9,7 @@
#define __ASM_PROCESSOR_H
#define KERNEL_DS UL(-1)
-#define USER_DS ((UL(1) << MAX_USER_VA_BITS) - 1)
+#define USER_DS ((UL(1) << VA_BITS) - 1)
/*
* On arm64 systems, unaligned accesses by the CPU are cheap, and so there is
@@ -26,10 +26,12 @@
#include <linux/init.h>
#include <linux/stddef.h>
#include <linux/string.h>
+#include <linux/thread_info.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/hw_breakpoint.h>
+#include <asm/kasan.h>
#include <asm/lse.h>
#include <asm/pgtable-hwdef.h>
#include <asm/pointer_auth.h>
@@ -214,6 +216,18 @@ static inline void start_thread(struct pt_regs *regs, unsigned long pc,
regs->sp = sp;
}
+static inline bool is_ttbr0_addr(unsigned long addr)
+{
+ /* entry assembly clears tags for TTBR0 addrs */
+ return addr < TASK_SIZE;
+}
+
+static inline bool is_ttbr1_addr(unsigned long addr)
+{
+ /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
+ return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
+}
+
#ifdef CONFIG_COMPAT
static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
diff --git a/arch/arm64/include/asm/pvclock-abi.h b/arch/arm64/include/asm/pvclock-abi.h
new file mode 100644
index 000000000000..c4f1c0a0789c
--- /dev/null
+++ b/arch/arm64/include/asm/pvclock-abi.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019 Arm Ltd. */
+
+#ifndef __ASM_PVCLOCK_ABI_H
+#define __ASM_PVCLOCK_ABI_H
+
+/* The below structure is defined in ARM DEN0057A */
+
+struct pvclock_vcpu_stolen_time {
+ __le32 revision;
+ __le32 attributes;
+ __le64 stolen_time;
+ /* Structure must be 64 byte aligned, pad to that size */
+ u8 padding[48];
+} __packed;
+
+#endif
diff --git a/arch/arm64/include/asm/syscall_wrapper.h b/arch/arm64/include/asm/syscall_wrapper.h
index 06d880b3526c..b383b4802a7b 100644
--- a/arch/arm64/include/asm/syscall_wrapper.h
+++ b/arch/arm64/include/asm/syscall_wrapper.h
@@ -66,24 +66,18 @@ struct pt_regs;
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
-#ifndef SYSCALL_DEFINE0
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long __arm64_sys_##sname(const struct pt_regs *__unused); \
ALLOW_ERROR_INJECTION(__arm64_sys_##sname, ERRNO); \
asmlinkage long __arm64_sys_##sname(const struct pt_regs *__unused)
-#endif
-#ifndef COND_SYSCALL
#define COND_SYSCALL(name) \
asmlinkage long __weak __arm64_sys_##name(const struct pt_regs *regs) \
{ \
return sys_ni_syscall(); \
}
-#endif
-#ifndef SYS_NI
#define SYS_NI(name) SYSCALL_ALIAS(__arm64_sys_##name, sys_ni_posix_timers);
-#endif
#endif /* __ASM_SYSCALL_WRAPPER_H */
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 59690613ac31..cee5928e1b7d 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -42,16 +42,6 @@ static inline int __in_irqentry_text(unsigned long ptr)
ptr < (unsigned long)&__irqentry_text_end;
}
-static inline int in_exception_text(unsigned long ptr)
-{
- int in;
-
- in = ptr >= (unsigned long)&__exception_text_start &&
- ptr < (unsigned long)&__exception_text_end;
-
- return in ? : __in_irqentry_text(ptr);
-}
-
static inline int in_entry_text(unsigned long ptr)
{
return ptr >= (unsigned long)&__entry_text_start &&
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 097d6bfac0b7..127712b0b970 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -378,20 +378,34 @@ do { \
extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
#define raw_copy_from_user(to, from, n) \
({ \
- __arch_copy_from_user((to), __uaccess_mask_ptr(from), (n)); \
+ unsigned long __acfu_ret; \
+ uaccess_enable_not_uao(); \
+ __acfu_ret = __arch_copy_from_user((to), \
+ __uaccess_mask_ptr(from), (n)); \
+ uaccess_disable_not_uao(); \
+ __acfu_ret; \
})
extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
#define raw_copy_to_user(to, from, n) \
({ \
- __arch_copy_to_user(__uaccess_mask_ptr(to), (from), (n)); \
+ unsigned long __actu_ret; \
+ uaccess_enable_not_uao(); \
+ __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), \
+ (from), (n)); \
+ uaccess_disable_not_uao(); \
+ __actu_ret; \
})
extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n);
#define raw_copy_in_user(to, from, n) \
({ \
- __arch_copy_in_user(__uaccess_mask_ptr(to), \
- __uaccess_mask_ptr(from), (n)); \
+ unsigned long __aciu_ret; \
+ uaccess_enable_not_uao(); \
+ __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \
+ __uaccess_mask_ptr(from), (n)); \
+ uaccess_disable_not_uao(); \
+ __aciu_ret; \
})
#define INLINE_COPY_TO_USER
@@ -400,8 +414,11 @@ extern unsigned long __must_check __arch_copy_in_user(void __user *to, const voi
extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n);
static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n)
{
- if (access_ok(to, n))
+ if (access_ok(to, n)) {
+ uaccess_enable_not_uao();
n = __arch_clear_user(__uaccess_mask_ptr(to), n);
+ uaccess_disable_not_uao();
+ }
return n;
}
#define clear_user __clear_user
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 67c21f9bdbad..820e5751ada7 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -164,8 +164,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+ __u8 ext_dabt_pending;
/* Align it to 8 bytes */
- __u8 pad[6];
+ __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
@@ -323,6 +324,8 @@ struct kvm_vcpu_events {
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
+#define KVM_ARM_VCPU_PVTIME_CTRL 2
+#define KVM_ARM_VCPU_PVTIME_IPA 0
/* KVM_IRQ_LINE irq field index values */
#define KVM_ARM_IRQ_VCPU2_SHIFT 28
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 478491f07b4f..fc6488660f64 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -13,9 +13,9 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE)
# Object file lists.
obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
- entry-fpsimd.o process.o ptrace.o setup.o signal.o \
- sys.o stacktrace.o time.o traps.o io.o vdso.o \
- hyp-stub.o psci.o cpu_ops.o insn.o \
+ entry-common.o entry-fpsimd.o process.o ptrace.o \
+ setup.o signal.o sys.o stacktrace.o time.o traps.o \
+ io.o vdso.o hyp-stub.o psci.o cpu_ops.o insn.o \
return_address.o cpuinfo.o cpu_errata.o \
cpufeature.o alternative.o cacheinfo.o \
smp.o smp_spin_table.o topology.o smccc-call.o \
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 214685760e1c..a5bdce8af65b 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -56,6 +56,7 @@ int main(void)
DEFINE(S_X24, offsetof(struct pt_regs, regs[24]));
DEFINE(S_X26, offsetof(struct pt_regs, regs[26]));
DEFINE(S_X28, offsetof(struct pt_regs, regs[28]));
+ DEFINE(S_FP, offsetof(struct pt_regs, regs[29]));
DEFINE(S_LR, offsetof(struct pt_regs, regs[30]));
DEFINE(S_SP, offsetof(struct pt_regs, sp));
DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate));
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 93f34b4eca25..6a09ca7644ea 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -6,7 +6,6 @@
*/
#include <linux/arm-smccc.h>
-#include <linux/psci.h>
#include <linux/types.h>
#include <linux/cpu.h>
#include <asm/cpu.h>
@@ -88,13 +87,21 @@ has_mismatched_cache_type(const struct arm64_cpu_capabilities *entry,
}
static void
-cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *__unused)
+cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
{
u64 mask = arm64_ftr_reg_ctrel0.strict_mask;
+ bool enable_uct_trap = false;
/* Trap CTR_EL0 access on this CPU, only if it has a mismatch */
if ((read_cpuid_cachetype() & mask) !=
(arm64_ftr_reg_ctrel0.sys_val & mask))
+ enable_uct_trap = true;
+
+ /* ... or if the system is affected by an erratum */
+ if (cap->capability == ARM64_WORKAROUND_1542419)
+ enable_uct_trap = true;
+
+ if (enable_uct_trap)
sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
}
@@ -167,9 +174,7 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn,
}
#endif /* CONFIG_KVM_INDIRECT_VECTORS */
-#include <uapi/linux/psci.h>
#include <linux/arm-smccc.h>
-#include <linux/psci.h>
static void call_smc_arch_workaround_1(void)
{
@@ -213,43 +218,31 @@ static int detect_harden_bp_fw(void)
struct arm_smccc_res res;
u32 midr = read_cpuid_id();
- if (psci_ops.smccc_version == SMCCC_VERSION_1_0)
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+
+ switch ((int)res.a0) {
+ case 1:
+ /* Firmware says we're just fine */
+ return 0;
+ case 0:
+ break;
+ default:
return -1;
+ }
- switch (psci_ops.conduit) {
- case PSCI_CONDUIT_HVC:
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
- switch ((int)res.a0) {
- case 1:
- /* Firmware says we're just fine */
- return 0;
- case 0:
- cb = call_hvc_arch_workaround_1;
- /* This is a guest, no need to patch KVM vectors */
- smccc_start = NULL;
- smccc_end = NULL;
- break;
- default:
- return -1;
- }
+ switch (arm_smccc_1_1_get_conduit()) {
+ case SMCCC_CONDUIT_HVC:
+ cb = call_hvc_arch_workaround_1;
+ /* This is a guest, no need to patch KVM vectors */
+ smccc_start = NULL;
+ smccc_end = NULL;
break;
- case PSCI_CONDUIT_SMC:
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
- switch ((int)res.a0) {
- case 1:
- /* Firmware says we're just fine */
- return 0;
- case 0:
- cb = call_smc_arch_workaround_1;
- smccc_start = __smccc_workaround_1_smc_start;
- smccc_end = __smccc_workaround_1_smc_end;
- break;
- default:
- return -1;
- }
+ case SMCCC_CONDUIT_SMC:
+ cb = call_smc_arch_workaround_1;
+ smccc_start = __smccc_workaround_1_smc_start;
+ smccc_end = __smccc_workaround_1_smc_end;
break;
default:
@@ -309,11 +302,11 @@ void __init arm64_update_smccc_conduit(struct alt_instr *alt,
BUG_ON(nr_inst != 1);
- switch (psci_ops.conduit) {
- case PSCI_CONDUIT_HVC:
+ switch (arm_smccc_1_1_get_conduit()) {
+ case SMCCC_CONDUIT_HVC:
insn = aarch64_insn_get_hvc_value();
break;
- case PSCI_CONDUIT_SMC:
+ case SMCCC_CONDUIT_SMC:
insn = aarch64_insn_get_smc_value();
break;
default:
@@ -339,6 +332,8 @@ void __init arm64_enable_wa2_handling(struct alt_instr *alt,
void arm64_set_ssbd_mitigation(bool state)
{
+ int conduit;
+
if (!IS_ENABLED(CONFIG_ARM64_SSBD)) {
pr_info_once("SSBD disabled by kernel configuration\n");
return;
@@ -352,19 +347,10 @@ void arm64_set_ssbd_mitigation(bool state)
return;
}
- switch (psci_ops.conduit) {
- case PSCI_CONDUIT_HVC:
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL);
- break;
-
- case PSCI_CONDUIT_SMC:
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL);
- break;
+ conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state,
+ NULL);
- default:
- WARN_ON_ONCE(1);
- break;
- }
+ WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE);
}
static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
@@ -374,6 +360,7 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
bool required = true;
s32 val;
bool this_cpu_safe = false;
+ int conduit;
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
@@ -391,25 +378,10 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
goto out_printmsg;
}
- if (psci_ops.smccc_version == SMCCC_VERSION_1_0) {
- ssbd_state = ARM64_SSBD_UNKNOWN;
- if (!this_cpu_safe)
- __ssb_safe = false;
- return false;
- }
-
- switch (psci_ops.conduit) {
- case PSCI_CONDUIT_HVC:
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_2, &res);
- break;
-
- case PSCI_CONDUIT_SMC:
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_2, &res);
- break;
+ conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_2, &res);
- default:
+ if (conduit == SMCCC_CONDUIT_NONE) {
ssbd_state = ARM64_SSBD_UNKNOWN;
if (!this_cpu_safe)
__ssb_safe = false;
@@ -650,9 +622,21 @@ needs_tx2_tvm_workaround(const struct arm64_cpu_capabilities *entry,
return false;
}
-#ifdef CONFIG_HARDEN_EL2_VECTORS
+static bool __maybe_unused
+has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry,
+ int scope)
+{
+ u32 midr = read_cpuid_id();
+ bool has_dic = read_cpuid_cachetype() & BIT(CTR_DIC_SHIFT);
+ const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1);
+
+ WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+ return is_midr_in_range(midr, &range) && has_dic;
+}
+
+#if defined(CONFIG_HARDEN_EL2_VECTORS) || defined(CONFIG_ARM64_ERRATUM_1319367)
-static const struct midr_range arm64_harden_el2_vectors[] = {
+static const struct midr_range ca57_a72[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
{},
@@ -881,7 +865,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
{
.desc = "EL2 vector hardening",
.capability = ARM64_HARDEN_EL2_VECTORS,
- ERRATA_MIDR_RANGE_LIST(arm64_harden_el2_vectors),
+ ERRATA_MIDR_RANGE_LIST(ca57_a72),
},
#endif
{
@@ -927,6 +911,23 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
ERRATA_MIDR_RANGE_LIST(tx2_family_cpus),
},
#endif
+#ifdef CONFIG_ARM64_ERRATUM_1542419
+ {
+ /* we depend on the firmware portion for correctness */
+ .desc = "ARM erratum 1542419 (kernel portion)",
+ .capability = ARM64_WORKAROUND_1542419,
+ .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
+ .matches = has_neoverse_n1_erratum_1542419,
+ .cpu_enable = cpu_enable_trap_ctr_access,
+ },
+#endif
+#ifdef CONFIG_ARM64_ERRATUM_1319367
+ {
+ .desc = "ARM erratum 1319367",
+ .capability = ARM64_WORKAROUND_1319367,
+ ERRATA_MIDR_RANGE_LIST(ca57_a72),
+ },
+#endif
{
}
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 80f459ad0190..04cf64e9f0c9 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -982,6 +982,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
+ MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL),
{ /* sentinel */ }
};
char const *str = "kpti command line option";
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 05933c065732..56bba746da1c 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -329,7 +329,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
info->reg_cntfrq = arch_timer_get_cntfrq();
/*
* Use the effective value of the CTR_EL0 than the raw value
- * exposed by the CPU. CTR_E0.IDC field value must be interpreted
+ * exposed by the CPU. CTR_EL0.IDC field value must be interpreted
* with the CLIDR_EL1 fields to avoid triggering false warnings
* when there is a mismatch across the CPUs. Keep track of the
* effective value of the CTR_EL0 in our internal records for
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
new file mode 100644
index 000000000000..5dce5e56995a
--- /dev/null
+++ b/arch/arm64/kernel/entry-common.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Exception handling code
+ *
+ * Copyright (C) 2019 ARM Ltd.
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/ptrace.h>
+#include <linux/thread_info.h>
+
+#include <asm/cpufeature.h>
+#include <asm/daifflags.h>
+#include <asm/esr.h>
+#include <asm/exception.h>
+#include <asm/kprobes.h>
+#include <asm/mmu.h>
+#include <asm/sysreg.h>
+
+static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
+{
+ unsigned long far = read_sysreg(far_el1);
+
+ local_daif_inherit(regs);
+ far = untagged_addr(far);
+ do_mem_abort(far, esr, regs);
+}
+NOKPROBE_SYMBOL(el1_abort);
+
+static void notrace el1_pc(struct pt_regs *regs, unsigned long esr)
+{
+ unsigned long far = read_sysreg(far_el1);
+
+ local_daif_inherit(regs);
+ do_sp_pc_abort(far, esr, regs);
+}
+NOKPROBE_SYMBOL(el1_pc);
+
+static void el1_undef(struct pt_regs *regs)
+{
+ local_daif_inherit(regs);
+ do_undefinstr(regs);
+}
+NOKPROBE_SYMBOL(el1_undef);
+
+static void el1_inv(struct pt_regs *regs, unsigned long esr)
+{
+ local_daif_inherit(regs);
+ bad_mode(regs, 0, esr);
+}
+NOKPROBE_SYMBOL(el1_inv);
+
+static void notrace el1_dbg(struct pt_regs *regs, unsigned long esr)
+{
+ unsigned long far = read_sysreg(far_el1);
+
+ /*
+ * The CPU masked interrupts, and we are leaving them masked during
+ * do_debug_exception(). Update PMR as if we had called
+ * local_mask_daif().
+ */
+ if (system_uses_irq_prio_masking())
+ gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
+
+ do_debug_exception(far, esr, regs);
+}
+NOKPROBE_SYMBOL(el1_dbg);
+
+asmlinkage void notrace el1_sync_handler(struct pt_regs *regs)
+{
+ unsigned long esr = read_sysreg(esr_el1);
+
+ switch (ESR_ELx_EC(esr)) {
+ case ESR_ELx_EC_DABT_CUR:
+ case ESR_ELx_EC_IABT_CUR:
+ el1_abort(regs, esr);
+ break;
+ /*
+ * We don't handle ESR_ELx_EC_SP_ALIGN, since we will have hit a
+ * recursive exception when trying to push the initial pt_regs.
+ */
+ case ESR_ELx_EC_PC_ALIGN:
+ el1_pc(regs, esr);
+ break;
+ case ESR_ELx_EC_SYS64:
+ case ESR_ELx_EC_UNKNOWN:
+ el1_undef(regs);
+ break;
+ case ESR_ELx_EC_BREAKPT_CUR:
+ case ESR_ELx_EC_SOFTSTP_CUR:
+ case ESR_ELx_EC_WATCHPT_CUR:
+ case ESR_ELx_EC_BRK64:
+ el1_dbg(regs, esr);
+ break;
+ default:
+ el1_inv(regs, esr);
+ };
+}
+NOKPROBE_SYMBOL(el1_sync_handler);
+
+static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
+{
+ unsigned long far = read_sysreg(far_el1);
+
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ far = untagged_addr(far);
+ do_mem_abort(far, esr, regs);
+}
+NOKPROBE_SYMBOL(el0_da);
+
+static void notrace el0_ia(struct pt_regs *regs, unsigned long esr)
+{
+ unsigned long far = read_sysreg(far_el1);
+
+ /*
+ * We've taken an instruction abort from userspace and not yet
+ * re-enabled IRQs. If the address is a kernel address, apply
+ * BP hardening prior to enabling IRQs and pre-emption.
+ */
+ if (!is_ttbr0_addr(far))
+ arm64_apply_bp_hardening();
+
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_mem_abort(far, esr, regs);
+}
+NOKPROBE_SYMBOL(el0_ia);
+
+static void notrace el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_fpsimd_acc(esr, regs);
+}
+NOKPROBE_SYMBOL(el0_fpsimd_acc);
+
+static void notrace el0_sve_acc(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_sve_acc(esr, regs);
+}
+NOKPROBE_SYMBOL(el0_sve_acc);
+
+static void notrace el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_fpsimd_exc(esr, regs);
+}
+NOKPROBE_SYMBOL(el0_fpsimd_exc);
+
+static void notrace el0_sys(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_sysinstr(esr, regs);
+}
+NOKPROBE_SYMBOL(el0_sys);
+
+static void notrace el0_pc(struct pt_regs *regs, unsigned long esr)
+{
+ unsigned long far = read_sysreg(far_el1);
+
+ if (!is_ttbr0_addr(instruction_pointer(regs)))
+ arm64_apply_bp_hardening();
+
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_sp_pc_abort(far, esr, regs);
+}
+NOKPROBE_SYMBOL(el0_pc);
+
+static void notrace el0_sp(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX_NOIRQ);
+ do_sp_pc_abort(regs->sp, esr, regs);
+}
+NOKPROBE_SYMBOL(el0_sp);
+
+static void notrace el0_undef(struct pt_regs *regs)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_undefinstr(regs);
+}
+NOKPROBE_SYMBOL(el0_undef);
+
+static void notrace el0_inv(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ bad_el0_sync(regs, 0, esr);
+}
+NOKPROBE_SYMBOL(el0_inv);
+
+static void notrace el0_dbg(struct pt_regs *regs, unsigned long esr)
+{
+ /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */
+ unsigned long far = read_sysreg(far_el1);
+
+ if (system_uses_irq_prio_masking())
+ gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
+
+ user_exit_irqoff();
+ do_debug_exception(far, esr, regs);
+ local_daif_restore(DAIF_PROCCTX_NOIRQ);
+}
+NOKPROBE_SYMBOL(el0_dbg);
+
+static void notrace el0_svc(struct pt_regs *regs)
+{
+ if (system_uses_irq_prio_masking())
+ gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
+
+ el0_svc_handler(regs);
+}
+NOKPROBE_SYMBOL(el0_svc);
+
+asmlinkage void notrace el0_sync_handler(struct pt_regs *regs)
+{
+ unsigned long esr = read_sysreg(esr_el1);
+
+ switch (ESR_ELx_EC(esr)) {
+ case ESR_ELx_EC_SVC64:
+ el0_svc(regs);
+ break;
+ case ESR_ELx_EC_DABT_LOW:
+ el0_da(regs, esr);
+ break;
+ case ESR_ELx_EC_IABT_LOW:
+ el0_ia(regs, esr);
+ break;
+ case ESR_ELx_EC_FP_ASIMD:
+ el0_fpsimd_acc(regs, esr);
+ break;
+ case ESR_ELx_EC_SVE:
+ el0_sve_acc(regs, esr);
+ break;
+ case ESR_ELx_EC_FP_EXC64:
+ el0_fpsimd_exc(regs, esr);
+ break;
+ case ESR_ELx_EC_SYS64:
+ case ESR_ELx_EC_WFx:
+ el0_sys(regs, esr);
+ break;
+ case ESR_ELx_EC_SP_ALIGN:
+ el0_sp(regs, esr);
+ break;
+ case ESR_ELx_EC_PC_ALIGN:
+ el0_pc(regs, esr);
+ break;
+ case ESR_ELx_EC_UNKNOWN:
+ el0_undef(regs);
+ break;
+ case ESR_ELx_EC_BREAKPT_LOW:
+ case ESR_ELx_EC_SOFTSTP_LOW:
+ case ESR_ELx_EC_WATCHPT_LOW:
+ case ESR_ELx_EC_BRK64:
+ el0_dbg(regs, esr);
+ break;
+ default:
+ el0_inv(regs, esr);
+ }
+}
+NOKPROBE_SYMBOL(el0_sync_handler);
+
+#ifdef CONFIG_COMPAT
+static void notrace el0_cp15(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_cp15instr(esr, regs);
+}
+NOKPROBE_SYMBOL(el0_cp15);
+
+static void notrace el0_svc_compat(struct pt_regs *regs)
+{
+ if (system_uses_irq_prio_masking())
+ gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
+
+ el0_svc_compat_handler(regs);
+}
+NOKPROBE_SYMBOL(el0_svc_compat);
+
+asmlinkage void notrace el0_sync_compat_handler(struct pt_regs *regs)
+{
+ unsigned long esr = read_sysreg(esr_el1);
+
+ switch (ESR_ELx_EC(esr)) {
+ case ESR_ELx_EC_SVC32:
+ el0_svc_compat(regs);
+ break;
+ case ESR_ELx_EC_DABT_LOW:
+ el0_da(regs, esr);
+ break;
+ case ESR_ELx_EC_IABT_LOW:
+ el0_ia(regs, esr);
+ break;
+ case ESR_ELx_EC_FP_ASIMD:
+ el0_fpsimd_acc(regs, esr);
+ break;
+ case ESR_ELx_EC_FP_EXC32:
+ el0_fpsimd_exc(regs, esr);
+ break;
+ case ESR_ELx_EC_PC_ALIGN:
+ el0_pc(regs, esr);
+ break;
+ case ESR_ELx_EC_UNKNOWN:
+ case ESR_ELx_EC_CP14_MR:
+ case ESR_ELx_EC_CP14_LS:
+ case ESR_ELx_EC_CP14_64:
+ el0_undef(regs);
+ break;
+ case ESR_ELx_EC_CP15_32:
+ case ESR_ELx_EC_CP15_64:
+ el0_cp15(regs, esr);
+ break;
+ case ESR_ELx_EC_BREAKPT_LOW:
+ case ESR_ELx_EC_SOFTSTP_LOW:
+ case ESR_ELx_EC_WATCHPT_LOW:
+ case ESR_ELx_EC_BKPT32:
+ el0_dbg(regs, esr);
+ break;
+ default:
+ el0_inv(regs, esr);
+ }
+}
+NOKPROBE_SYMBOL(el0_sync_compat_handler);
+#endif /* CONFIG_COMPAT */
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 33d003d80121..4fe1514fcbfd 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -7,10 +7,137 @@
*/
#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
#include <asm/assembler.h>
#include <asm/ftrace.h>
#include <asm/insn.h>
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+/*
+ * Due to -fpatchable-function-entry=2, the compiler has placed two NOPs before
+ * the regular function prologue. For an enabled callsite, ftrace_init_nop() and
+ * ftrace_make_call() have patched those NOPs to:
+ *
+ * MOV X9, LR
+ * BL <entry>
+ *
+ * ... where <entry> is either ftrace_caller or ftrace_regs_caller.
+ *
+ * Each instrumented function follows the AAPCS, so here x0-x8 and x19-x30 are
+ * live, and x9-x18 are safe to clobber.
+ *
+ * We save the callsite's context into a pt_regs before invoking any ftrace
+ * callbacks. So that we can get a sensible backtrace, we create a stack record
+ * for the callsite and the ftrace entry assembly. This is not sufficient for
+ * reliable stacktrace: until we create the callsite stack record, its caller
+ * is missing from the LR and existing chain of frame records.
+ */
+ .macro ftrace_regs_entry, allregs=0
+ /* Make room for pt_regs, plus a callee frame */
+ sub sp, sp, #(S_FRAME_SIZE + 16)
+
+ /* Save function arguments (and x9 for simplicity) */
+ stp x0, x1, [sp, #S_X0]
+ stp x2, x3, [sp, #S_X2]
+ stp x4, x5, [sp, #S_X4]
+ stp x6, x7, [sp, #S_X6]
+ stp x8, x9, [sp, #S_X8]
+
+ /* Optionally save the callee-saved registers, always save the FP */
+ .if \allregs == 1
+ stp x10, x11, [sp, #S_X10]
+ stp x12, x13, [sp, #S_X12]
+ stp x14, x15, [sp, #S_X14]
+ stp x16, x17, [sp, #S_X16]
+ stp x18, x19, [sp, #S_X18]
+ stp x20, x21, [sp, #S_X20]
+ stp x22, x23, [sp, #S_X22]
+ stp x24, x25, [sp, #S_X24]
+ stp x26, x27, [sp, #S_X26]
+ stp x28, x29, [sp, #S_X28]
+ .else
+ str x29, [sp, #S_FP]
+ .endif
+
+ /* Save the callsite's SP and LR */
+ add x10, sp, #(S_FRAME_SIZE + 16)
+ stp x9, x10, [sp, #S_LR]
+
+ /* Save the PC after the ftrace callsite */
+ str x30, [sp, #S_PC]
+
+ /* Create a frame record for the callsite above pt_regs */
+ stp x29, x9, [sp, #S_FRAME_SIZE]
+ add x29, sp, #S_FRAME_SIZE
+
+ /* Create our frame record within pt_regs. */
+ stp x29, x30, [sp, #S_STACKFRAME]
+ add x29, sp, #S_STACKFRAME
+ .endm
+
+ENTRY(ftrace_regs_caller)
+ ftrace_regs_entry 1
+ b ftrace_common
+ENDPROC(ftrace_regs_caller)
+
+ENTRY(ftrace_caller)
+ ftrace_regs_entry 0
+ b ftrace_common
+ENDPROC(ftrace_caller)
+
+ENTRY(ftrace_common)
+ sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn)
+ mov x1, x9 // parent_ip (callsite's LR)
+ ldr_l x2, function_trace_op // op
+ mov x3, sp // regs
+
+GLOBAL(ftrace_call)
+ bl ftrace_stub
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call) // ftrace_graph_caller();
+ nop // If enabled, this will be replaced
+ // "b ftrace_graph_caller"
+#endif
+
+/*
+ * At the callsite x0-x8 and x19-x30 were live. Any C code will have preserved
+ * x19-x29 per the AAPCS, and we created frame records upon entry, so we need
+ * to restore x0-x8, x29, and x30.
+ */
+ftrace_common_return:
+ /* Restore function arguments */
+ ldp x0, x1, [sp]
+ ldp x2, x3, [sp, #S_X2]
+ ldp x4, x5, [sp, #S_X4]
+ ldp x6, x7, [sp, #S_X6]
+ ldr x8, [sp, #S_X8]
+
+ /* Restore the callsite's FP, LR, PC */
+ ldr x29, [sp, #S_FP]
+ ldr x30, [sp, #S_LR]
+ ldr x9, [sp, #S_PC]
+
+ /* Restore the callsite's SP */
+ add sp, sp, #S_FRAME_SIZE + 16
+
+ ret x9
+ENDPROC(ftrace_common)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ ldr x0, [sp, #S_PC]
+ sub x0, x0, #AARCH64_INSN_SIZE // ip (callsite's BL insn)
+ add x1, sp, #S_LR // parent_ip (callsite's LR)
+ ldr x2, [sp, #S_FRAME_SIZE] // parent fp (callsite's FP)
+ bl prepare_ftrace_return
+ b ftrace_common_return
+ENDPROC(ftrace_graph_caller)
+#else
+#endif
+
+#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+
/*
* Gcc with -pg will put the following code in the beginning of each function:
* mov x0, x30
@@ -160,11 +287,6 @@ GLOBAL(ftrace_graph_call) // ftrace_graph_caller();
mcount_exit
ENDPROC(ftrace_caller)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(ftrace_stub)
- ret
-ENDPROC(ftrace_stub)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/*
@@ -184,7 +306,15 @@ ENTRY(ftrace_graph_caller)
mcount_exit
ENDPROC(ftrace_graph_caller)
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+
+ENTRY(ftrace_stub)
+ ret
+ENDPROC(ftrace_stub)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/*
* void return_to_handler(void)
*
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index cf3bd2976e57..583f71abbe98 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -269,8 +269,10 @@ alternative_else_nop_endif
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
ldr x20, [sp, #S_PMR_SAVE]
msr_s SYS_ICC_PMR_EL1, x20
- /* Ensure priority change is seen by redistributor */
- dsb sy
+ mrs_s x21, SYS_ICC_CTLR_EL1
+ tbz x21, #6, .L__skip_pmr_sync\@ // Check for ICC_CTLR_EL1.PMHE
+ dsb sy // Ensure priority change is seen by redistributor
+.L__skip_pmr_sync\@:
alternative_else_nop_endif
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
@@ -578,76 +580,9 @@ ENDPROC(el1_error_invalid)
.align 6
el1_sync:
kernel_entry 1
- mrs x1, esr_el1 // read the syndrome register
- lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
- cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1
- b.eq el1_da
- cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1
- b.eq el1_ia
- cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
- b.eq el1_undef
- cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
- b.eq el1_pc
- cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL1
- b.eq el1_undef
- cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1
- b.ge el1_dbg
- b el1_inv
-
-el1_ia:
- /*
- * Fall through to the Data abort case
- */
-el1_da:
- /*
- * Data abort handling
- */
- mrs x3, far_el1
- inherit_daif pstate=x23, tmp=x2
- untagged_addr x0, x3
- mov x2, sp // struct pt_regs
- bl do_mem_abort
-
- kernel_exit 1
-el1_pc:
- /*
- * PC alignment exception handling. We don't handle SP alignment faults,
- * since we will have hit a recursive exception when trying to push the
- * initial pt_regs.
- */
- mrs x0, far_el1
- inherit_daif pstate=x23, tmp=x2
- mov x2, sp
- bl do_sp_pc_abort
- ASM_BUG()
-el1_undef:
- /*
- * Undefined instruction
- */
- inherit_daif pstate=x23, tmp=x2
mov x0, sp
- bl do_undefinstr
+ bl el1_sync_handler
kernel_exit 1
-el1_dbg:
- /*
- * Debug exception handling
- */
- cmp x24, #ESR_ELx_EC_BRK64 // if BRK64
- cinc x24, x24, eq // set bit '0'
- tbz x24, #0, el1_inv // EL1 only
- gic_prio_kentry_setup tmp=x3
- mrs x0, far_el1
- mov x2, sp // struct pt_regs
- bl do_debug_exception
- kernel_exit 1
-el1_inv:
- // TODO: add support for undefined instructions in kernel mode
- inherit_daif pstate=x23, tmp=x2
- mov x0, sp
- mov x2, x1
- mov x1, #BAD_SYNC
- bl bad_mode
- ASM_BUG()
ENDPROC(el1_sync)
.align 6
@@ -714,71 +649,18 @@ ENDPROC(el1_irq)
.align 6
el0_sync:
kernel_entry 0
- mrs x25, esr_el1 // read the syndrome register
- lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
- cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
- b.eq el0_svc
- cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
- b.eq el0_da
- cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
- b.eq el0_ia
- cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
- b.eq el0_fpsimd_acc
- cmp x24, #ESR_ELx_EC_SVE // SVE access
- b.eq el0_sve_acc
- cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception
- b.eq el0_fpsimd_exc
- cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
- ccmp x24, #ESR_ELx_EC_WFx, #4, ne
- b.eq el0_sys
- cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
- b.eq el0_sp
- cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
- b.eq el0_pc
- cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
- b.eq el0_undef
- cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
- b.ge el0_dbg
- b el0_inv
+ mov x0, sp
+ bl el0_sync_handler
+ b ret_to_user
#ifdef CONFIG_COMPAT
.align 6
el0_sync_compat:
kernel_entry 0, 32
- mrs x25, esr_el1 // read the syndrome register
- lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
- cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state
- b.eq el0_svc_compat
- cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
- b.eq el0_da
- cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
- b.eq el0_ia
- cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
- b.eq el0_fpsimd_acc
- cmp x24, #ESR_ELx_EC_FP_EXC32 // FP/ASIMD exception
- b.eq el0_fpsimd_exc
- cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
- b.eq el0_pc
- cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
- b.eq el0_undef
- cmp x24, #ESR_ELx_EC_CP15_32 // CP15 MRC/MCR trap
- b.eq el0_cp15
- cmp x24, #ESR_ELx_EC_CP15_64 // CP15 MRRC/MCRR trap
- b.eq el0_cp15
- cmp x24, #ESR_ELx_EC_CP14_MR // CP14 MRC/MCR trap
- b.eq el0_undef
- cmp x24, #ESR_ELx_EC_CP14_LS // CP14 LDC/STC trap
- b.eq el0_undef
- cmp x24, #ESR_ELx_EC_CP14_64 // CP14 MRRC/MCRR trap
- b.eq el0_undef
- cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
- b.ge el0_dbg
- b el0_inv
-el0_svc_compat:
- gic_prio_kentry_setup tmp=x1
mov x0, sp
- bl el0_svc_compat_handler
+ bl el0_sync_compat_handler
b ret_to_user
+ENDPROC(el0_sync)
.align 6
el0_irq_compat:
@@ -788,139 +670,7 @@ el0_irq_compat:
el0_error_compat:
kernel_entry 0, 32
b el0_error_naked
-
-el0_cp15:
- /*
- * Trapped CP15 (MRC, MCR, MRRC, MCRR) instructions
- */
- ct_user_exit_irqoff
- enable_daif
- mov x0, x25
- mov x1, sp
- bl do_cp15instr
- b ret_to_user
-#endif
-
-el0_da:
- /*
- * Data abort handling
- */
- mrs x26, far_el1
- ct_user_exit_irqoff
- enable_daif
- untagged_addr x0, x26
- mov x1, x25
- mov x2, sp
- bl do_mem_abort
- b ret_to_user
-el0_ia:
- /*
- * Instruction abort handling
- */
- mrs x26, far_el1
- gic_prio_kentry_setup tmp=x0
- ct_user_exit_irqoff
- enable_da_f
-#ifdef CONFIG_TRACE_IRQFLAGS
- bl trace_hardirqs_off
-#endif
- mov x0, x26
- mov x1, x25
- mov x2, sp
- bl do_el0_ia_bp_hardening
- b ret_to_user
-el0_fpsimd_acc:
- /*
- * Floating Point or Advanced SIMD access
- */
- ct_user_exit_irqoff
- enable_daif
- mov x0, x25
- mov x1, sp
- bl do_fpsimd_acc
- b ret_to_user
-el0_sve_acc:
- /*
- * Scalable Vector Extension access
- */
- ct_user_exit_irqoff
- enable_daif
- mov x0, x25
- mov x1, sp
- bl do_sve_acc
- b ret_to_user
-el0_fpsimd_exc:
- /*
- * Floating Point, Advanced SIMD or SVE exception
- */
- ct_user_exit_irqoff
- enable_daif
- mov x0, x25
- mov x1, sp
- bl do_fpsimd_exc
- b ret_to_user
-el0_sp:
- ldr x26, [sp, #S_SP]
- b el0_sp_pc
-el0_pc:
- mrs x26, far_el1
-el0_sp_pc:
- /*
- * Stack or PC alignment exception handling
- */
- gic_prio_kentry_setup tmp=x0
- ct_user_exit_irqoff
- enable_da_f
-#ifdef CONFIG_TRACE_IRQFLAGS
- bl trace_hardirqs_off
#endif
- mov x0, x26
- mov x1, x25
- mov x2, sp
- bl do_sp_pc_abort
- b ret_to_user
-el0_undef:
- /*
- * Undefined instruction
- */
- ct_user_exit_irqoff
- enable_daif
- mov x0, sp
- bl do_undefinstr
- b ret_to_user
-el0_sys:
- /*
- * System instructions, for trapped cache maintenance instructions
- */
- ct_user_exit_irqoff
- enable_daif
- mov x0, x25
- mov x1, sp
- bl do_sysinstr
- b ret_to_user
-el0_dbg:
- /*
- * Debug exception handling
- */
- tbnz x24, #0, el0_inv // EL0 only
- mrs x24, far_el1
- gic_prio_kentry_setup tmp=x3
- ct_user_exit_irqoff
- mov x0, x24
- mov x1, x25
- mov x2, sp
- bl do_debug_exception
- enable_da_f
- b ret_to_user
-el0_inv:
- ct_user_exit_irqoff
- enable_daif
- mov x0, sp
- mov x1, #BAD_SYNC
- mov x2, x25
- bl bad_el0_sync
- b ret_to_user
-ENDPROC(el0_sync)
.align 6
el0_irq:
@@ -999,17 +749,6 @@ finish_ret_to_user:
kernel_exit 0
ENDPROC(ret_to_user)
-/*
- * SVC handler.
- */
- .align 6
-el0_svc:
- gic_prio_kentry_setup tmp=x1
- mov x0, sp
- bl el0_svc_handler
- b ret_to_user
-ENDPROC(el0_svc)
-
.popsection // .entry.text
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 37d3912cfe06..3eb338f14386 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -920,7 +920,7 @@ void fpsimd_release_task(struct task_struct *dead_task)
* would have disabled the SVE access trap for userspace during
* ret_to_user, making an SVE access trap impossible in that case.
*/
-asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs)
+void do_sve_acc(unsigned int esr, struct pt_regs *regs)
{
/* Even if we chose not to use SVE, the hardware could still trap: */
if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) {
@@ -947,7 +947,7 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs)
/*
* Trapped FP/ASIMD access.
*/
-asmlinkage void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
+void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
{
/* TODO: implement lazy context saving/restoring */
WARN_ON(1);
@@ -956,7 +956,7 @@ asmlinkage void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
/*
* Raise a SIGFPE for the current process.
*/
-asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
+void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
{
unsigned int si_code = FPE_FLTUNK;
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 06e56b470315..8618faa82e6d 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -62,6 +62,19 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
return ftrace_modify_code(pc, 0, new, false);
}
+static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
+{
+#ifdef CONFIG_ARM64_MODULE_PLTS
+ struct plt_entry *plt = mod->arch.ftrace_trampolines;
+
+ if (addr == FTRACE_ADDR)
+ return &plt[FTRACE_PLT_IDX];
+ if (addr == FTRACE_REGS_ADDR && IS_ENABLED(CONFIG_FTRACE_WITH_REGS))
+ return &plt[FTRACE_REGS_PLT_IDX];
+#endif
+ return NULL;
+}
+
/*
* Turn on the call to ftrace_caller() in instrumented function
*/
@@ -72,9 +85,11 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
long offset = (long)pc - (long)addr;
if (offset < -SZ_128M || offset >= SZ_128M) {
-#ifdef CONFIG_ARM64_MODULE_PLTS
- struct plt_entry trampoline, *dst;
struct module *mod;
+ struct plt_entry *plt;
+
+ if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
+ return -EINVAL;
/*
* On kernels that support module PLTs, the offset between the
@@ -93,49 +108,13 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
if (WARN_ON(!mod))
return -EINVAL;
- /*
- * There is only one ftrace trampoline per module. For now,
- * this is not a problem since on arm64, all dynamic ftrace
- * invocations are routed via ftrace_caller(). This will need
- * to be revisited if support for multiple ftrace entry points
- * is added in the future, but for now, the pr_err() below
- * deals with a theoretical issue only.
- *
- * Note that PLTs are place relative, and plt_entries_equal()
- * checks whether they point to the same target. Here, we need
- * to check if the actual opcodes are in fact identical,
- * regardless of the offset in memory so use memcmp() instead.
- */
- dst = mod->arch.ftrace_trampoline;
- trampoline = get_plt_entry(addr, dst);
- if (memcmp(dst, &trampoline, sizeof(trampoline))) {
- if (plt_entry_is_initialized(dst)) {
- pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n");
- return -EINVAL;
- }
-
- /* point the trampoline to our ftrace entry point */
- module_disable_ro(mod);
- *dst = trampoline;
- module_enable_ro(mod, true);
-
- /*
- * Ensure updated trampoline is visible to instruction
- * fetch before we patch in the branch. Although the
- * architecture doesn't require an IPI in this case,
- * Neoverse-N1 erratum #1542419 does require one
- * if the TLB maintenance in module_enable_ro() is
- * skipped due to rodata_enabled. It doesn't seem worth
- * it to make it conditional given that this is
- * certainly not a fast-path.
- */
- flush_icache_range((unsigned long)&dst[0],
- (unsigned long)&dst[1]);
+ plt = get_ftrace_plt(mod, addr);
+ if (!plt) {
+ pr_err("ftrace: no module PLT for %ps\n", (void *)addr);
+ return -EINVAL;
}
- addr = (unsigned long)dst;
-#else /* CONFIG_ARM64_MODULE_PLTS */
- return -EINVAL;
-#endif /* CONFIG_ARM64_MODULE_PLTS */
+
+ addr = (unsigned long)plt;
}
old = aarch64_insn_gen_nop();
@@ -144,6 +123,55 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
return ftrace_modify_code(pc, old, new, true);
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ unsigned long pc = rec->ip;
+ u32 old, new;
+
+ old = aarch64_insn_gen_branch_imm(pc, old_addr,
+ AARCH64_INSN_BRANCH_LINK);
+ new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
+
+ return ftrace_modify_code(pc, old, new, true);
+}
+
+/*
+ * The compiler has inserted two NOPs before the regular function prologue.
+ * All instrumented functions follow the AAPCS, so x0-x8 and x19-x30 are live,
+ * and x9-x18 are free for our use.
+ *
+ * At runtime we want to be able to swing a single NOP <-> BL to enable or
+ * disable the ftrace call. The BL requires us to save the original LR value,
+ * so here we insert a <MOV X9, LR> over the first NOP so the instructions
+ * before the regular prologue are:
+ *
+ * | Compiled | Disabled | Enabled |
+ * +----------+------------+------------+
+ * | NOP | MOV X9, LR | MOV X9, LR |
+ * | NOP | NOP | BL <entry> |
+ *
+ * The LR value will be recovered by ftrace_regs_entry, and restored into LR
+ * before returning to the regular function prologue. When a function is not
+ * being traced, the MOV is not harmful given x9 is not live per the AAPCS.
+ *
+ * Note: ftrace_process_locs() has pre-adjusted rec->ip to be the address of
+ * the BL.
+ */
+int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
+{
+ unsigned long pc = rec->ip - AARCH64_INSN_SIZE;
+ u32 old, new;
+
+ old = aarch64_insn_gen_nop();
+ new = aarch64_insn_gen_move_reg(AARCH64_INSN_REG_9,
+ AARCH64_INSN_REG_LR,
+ AARCH64_INSN_VARIANT_64BIT);
+ return ftrace_modify_code(pc, old, new, true);
+}
+#endif
+
/*
* Turn off the call to ftrace_caller() in instrumented function
*/
@@ -156,9 +184,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
long offset = (long)pc - (long)addr;
if (offset < -SZ_128M || offset >= SZ_128M) {
-#ifdef CONFIG_ARM64_MODULE_PLTS
u32 replaced;
+ if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
+ return -EINVAL;
+
/*
* 'mod' is only set at module load time, but if we end up
* dealing with an out-of-range condition, we can assume it
@@ -189,9 +219,6 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
return -EINVAL;
validate = false;
-#else /* CONFIG_ARM64_MODULE_PLTS */
- return -EINVAL;
-#endif /* CONFIG_ARM64_MODULE_PLTS */
} else {
old = aarch64_insn_gen_branch_imm(pc, addr,
AARCH64_INSN_BRANCH_LINK);
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 38ee1514cd9c..0b727edf4104 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -51,7 +51,7 @@ int hw_breakpoint_slots(int type)
case TYPE_DATA:
return get_num_wrps();
default:
- pr_warning("unknown slot type: %d\n", type);
+ pr_warn("unknown slot type: %d\n", type);
return 0;
}
}
@@ -112,7 +112,7 @@ static u64 read_wb_reg(int reg, int n)
GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val);
GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val);
default:
- pr_warning("attempt to read from unknown breakpoint register %d\n", n);
+ pr_warn("attempt to read from unknown breakpoint register %d\n", n);
}
return val;
@@ -127,7 +127,7 @@ static void write_wb_reg(int reg, int n, u64 val)
GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val);
GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val);
default:
- pr_warning("attempt to write to unknown breakpoint register %d\n", n);
+ pr_warn("attempt to write to unknown breakpoint register %d\n", n);
}
isb();
}
@@ -145,7 +145,7 @@ static enum dbg_active_el debug_exception_level(int privilege)
case AARCH64_BREAKPOINT_EL1:
return DBG_ACTIVE_EL1;
default:
- pr_warning("invalid breakpoint privilege level %d\n", privilege);
+ pr_warn("invalid breakpoint privilege level %d\n", privilege);
return -EINVAL;
}
}
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index d801a7094076..513b29c3e735 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -1268,6 +1268,19 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
}
+/*
+ * MOV (register) is architecturally an alias of ORR (shifted register) where
+ * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m>
+ */
+u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst,
+ enum aarch64_insn_register src,
+ enum aarch64_insn_variant variant)
+{
+ return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR,
+ src, 0, variant,
+ AARCH64_INSN_LOGIC_ORR);
+}
+
u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr,
enum aarch64_insn_register reg,
enum aarch64_insn_adr_type type)
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 416f537bf614..2a11a962e571 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -19,6 +19,14 @@
#include <asm/pgtable.h>
#include <asm/sections.h>
+enum kaslr_status {
+ KASLR_ENABLED,
+ KASLR_DISABLED_CMDLINE,
+ KASLR_DISABLED_NO_SEED,
+ KASLR_DISABLED_FDT_REMAP,
+};
+
+static enum kaslr_status __initdata kaslr_status;
u64 __ro_after_init module_alloc_base;
u16 __initdata memstart_offset_seed;
@@ -91,15 +99,15 @@ u64 __init kaslr_early_init(u64 dt_phys)
*/
early_fixmap_init();
fdt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL);
- if (!fdt)
+ if (!fdt) {
+ kaslr_status = KASLR_DISABLED_FDT_REMAP;
return 0;
+ }
/*
* Retrieve (and wipe) the seed from the FDT
*/
seed = get_kaslr_seed(fdt);
- if (!seed)
- return 0;
/*
* Check if 'nokaslr' appears on the command line, and
@@ -107,8 +115,15 @@ u64 __init kaslr_early_init(u64 dt_phys)
*/
cmdline = kaslr_get_cmdline(fdt);
str = strstr(cmdline, "nokaslr");
- if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
+ if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) {
+ kaslr_status = KASLR_DISABLED_CMDLINE;
+ return 0;
+ }
+
+ if (!seed) {
+ kaslr_status = KASLR_DISABLED_NO_SEED;
return 0;
+ }
/*
* OK, so we are proceeding with KASLR enabled. Calculate a suitable
@@ -170,3 +185,24 @@ u64 __init kaslr_early_init(u64 dt_phys)
return offset;
}
+
+static int __init kaslr_init(void)
+{
+ switch (kaslr_status) {
+ case KASLR_ENABLED:
+ pr_info("KASLR enabled\n");
+ break;
+ case KASLR_DISABLED_CMDLINE:
+ pr_info("KASLR disabled on command line\n");
+ break;
+ case KASLR_DISABLED_NO_SEED:
+ pr_warn("KASLR disabled due to lack of seed\n");
+ break;
+ case KASLR_DISABLED_FDT_REMAP:
+ pr_warn("KASLR disabled due to FDT remapping failure\n");
+ break;
+ }
+
+ return 0;
+}
+core_initcall(kaslr_init)
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index b182442b87a3..65b08a74aec6 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -4,6 +4,7 @@
*/
#include <linux/elf.h>
+#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sort.h>
@@ -330,7 +331,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
tramp->sh_type = SHT_NOBITS;
tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
tramp->sh_addralign = __alignof__(struct plt_entry);
- tramp->sh_size = sizeof(struct plt_entry);
+ tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry);
}
return 0;
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 03ff15bffbb6..1cd1a4d0ed30 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -9,6 +9,7 @@
#include <linux/bitops.h>
#include <linux/elf.h>
+#include <linux/ftrace.h>
#include <linux/gfp.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
@@ -470,22 +471,58 @@ overflow:
return -ENOEXEC;
}
-int module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
+static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ const char *name)
{
const Elf_Shdr *s, *se;
const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) {
- if (strcmp(".altinstructions", secstrs + s->sh_name) == 0)
- apply_alternatives_module((void *)s->sh_addr, s->sh_size);
-#ifdef CONFIG_ARM64_MODULE_PLTS
- if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) &&
- !strcmp(".text.ftrace_trampoline", secstrs + s->sh_name))
- me->arch.ftrace_trampoline = (void *)s->sh_addr;
-#endif
+ if (strcmp(name, secstrs + s->sh_name) == 0)
+ return s;
}
+ return NULL;
+}
+
+static inline void __init_plt(struct plt_entry *plt, unsigned long addr)
+{
+ *plt = get_plt_entry(addr, plt);
+}
+
+static int module_init_ftrace_plt(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *mod)
+{
+#if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE)
+ const Elf_Shdr *s;
+ struct plt_entry *plts;
+
+ s = find_section(hdr, sechdrs, ".text.ftrace_trampoline");
+ if (!s)
+ return -ENOEXEC;
+
+ plts = (void *)s->sh_addr;
+
+ __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR);
+
+ if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
+ __init_plt(&plts[FTRACE_REGS_PLT_IDX], FTRACE_REGS_ADDR);
+
+ mod->arch.ftrace_trampolines = plts;
+#endif
return 0;
}
+
+int module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ const Elf_Shdr *s;
+ s = find_section(hdr, sechdrs, ".altinstructions");
+ if (s)
+ apply_alternatives_module((void *)s->sh_addr, s->sh_size);
+
+ return module_init_ftrace_plt(hdr, sechdrs, me);
+}
diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
index 4cfed91fe256..1ef702b0be2d 100644
--- a/arch/arm64/kernel/paravirt.c
+++ b/arch/arm64/kernel/paravirt.c
@@ -6,13 +6,153 @@
* Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
*/
+#define pr_fmt(fmt) "arm-pv: " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/cpuhotplug.h>
#include <linux/export.h>
+#include <linux/io.h>
#include <linux/jump_label.h>
+#include <linux/printk.h>
+#include <linux/psci.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
#include <linux/types.h>
+
#include <asm/paravirt.h>
+#include <asm/pvclock-abi.h>
+#include <asm/smp_plat.h>
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
struct paravirt_patch_template pv_ops;
EXPORT_SYMBOL_GPL(pv_ops);
+
+struct pv_time_stolen_time_region {
+ struct pvclock_vcpu_stolen_time *kaddr;
+};
+
+static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
+
+static bool steal_acc = true;
+static int __init parse_no_stealacc(char *arg)
+{
+ steal_acc = false;
+ return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
+/* return stolen time in ns by asking the hypervisor */
+static u64 pv_steal_clock(int cpu)
+{
+ struct pv_time_stolen_time_region *reg;
+
+ reg = per_cpu_ptr(&stolen_time_region, cpu);
+ if (!reg->kaddr) {
+ pr_warn_once("stolen time enabled but not configured for cpu %d\n",
+ cpu);
+ return 0;
+ }
+
+ return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time));
+}
+
+static int stolen_time_dying_cpu(unsigned int cpu)
+{
+ struct pv_time_stolen_time_region *reg;
+
+ reg = this_cpu_ptr(&stolen_time_region);
+ if (!reg->kaddr)
+ return 0;
+
+ memunmap(reg->kaddr);
+ memset(reg, 0, sizeof(*reg));
+
+ return 0;
+}
+
+static int init_stolen_time_cpu(unsigned int cpu)
+{
+ struct pv_time_stolen_time_region *reg;
+ struct arm_smccc_res res;
+
+ reg = this_cpu_ptr(&stolen_time_region);
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_ST, &res);
+
+ if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
+ return -EINVAL;
+
+ reg->kaddr = memremap(res.a0,
+ sizeof(struct pvclock_vcpu_stolen_time),
+ MEMREMAP_WB);
+
+ if (!reg->kaddr) {
+ pr_warn("Failed to map stolen time data structure\n");
+ return -ENOMEM;
+ }
+
+ if (le32_to_cpu(reg->kaddr->revision) != 0 ||
+ le32_to_cpu(reg->kaddr->attributes) != 0) {
+ pr_warn_once("Unexpected revision or attributes in stolen time data\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int pv_time_init_stolen_time(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ARM_KVMPV_STARTING,
+ "hypervisor/arm/pvtime:starting",
+ init_stolen_time_cpu, stolen_time_dying_cpu);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static bool has_pv_steal_clock(void)
+{
+ struct arm_smccc_res res;
+
+ /* To detect the presence of PV time support we require SMCCC 1.1+ */
+ if (psci_ops.smccc_version < SMCCC_VERSION_1_1)
+ return false;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_HV_PV_TIME_FEATURES, &res);
+
+ if (res.a0 != SMCCC_RET_SUCCESS)
+ return false;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_FEATURES,
+ ARM_SMCCC_HV_PV_TIME_ST, &res);
+
+ return (res.a0 == SMCCC_RET_SUCCESS);
+}
+
+int __init pv_time_init(void)
+{
+ int ret;
+
+ if (!has_pv_steal_clock())
+ return 0;
+
+ ret = pv_time_init_stolen_time();
+ if (ret)
+ return ret;
+
+ pv_ops.time.steal_clock = pv_steal_clock;
+
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+
+ pr_info("using stolen time PV\n");
+
+ return 0;
+}
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index a0b4f1bca491..e40b65645c86 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -158,133 +158,74 @@ armv8pmu_events_sysfs_show(struct device *dev,
return sprintf(page, "event=0x%03llx\n", pmu_attr->id);
}
-#define ARMV8_EVENT_ATTR(name, config) \
- PMU_EVENT_ATTR(name, armv8_event_attr_##name, \
- config, armv8pmu_events_sysfs_show)
-
-ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR);
-ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL);
-ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL);
-ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL);
-ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE);
-ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL);
-ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED);
-ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED);
-ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED);
-ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN);
-ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN);
-ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED);
-ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED);
-ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED);
-ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED);
-ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED);
-ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED);
-ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES);
-ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED);
-ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS);
-ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE);
-ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB);
-ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE);
-ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL);
-ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB);
-ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS);
-ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR);
-ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC);
-ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED);
-ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES);
-/* Don't expose the chain event in /sys, since it's useless in isolation */
-ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE);
-ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE);
-ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED);
-ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED);
-ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND);
-ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND);
-ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB);
-ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB);
-ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE);
-ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL);
-ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE);
-ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL);
-ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE);
-ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB);
-ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL);
-ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL);
-ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB);
-ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB);
-ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS);
-ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE);
-ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS);
-ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK);
-ARMV8_EVENT_ATTR(itlb_walk, ARMV8_PMUV3_PERFCTR_ITLB_WALK);
-ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD);
-ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD);
-ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD);
-ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP);
-ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED);
-ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE);
-ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION);
+#define ARMV8_EVENT_ATTR(name, config) \
+ (&((struct perf_pmu_events_attr) { \
+ .attr = __ATTR(name, 0444, armv8pmu_events_sysfs_show, NULL), \
+ .id = config, \
+ }).attr.attr)
static struct attribute *armv8_pmuv3_event_attrs[] = {
- &armv8_event_attr_sw_incr.attr.attr,
- &armv8_event_attr_l1i_cache_refill.attr.attr,
- &armv8_event_attr_l1i_tlb_refill.attr.attr,
- &armv8_event_attr_l1d_cache_refill.attr.attr,
- &armv8_event_attr_l1d_cache.attr.attr,
- &armv8_event_attr_l1d_tlb_refill.attr.attr,
- &armv8_event_attr_ld_retired.attr.attr,
- &armv8_event_attr_st_retired.attr.attr,
- &armv8_event_attr_inst_retired.attr.attr,
- &armv8_event_attr_exc_taken.attr.attr,
- &armv8_event_attr_exc_return.attr.attr,
- &armv8_event_attr_cid_write_retired.attr.attr,
- &armv8_event_attr_pc_write_retired.attr.attr,
- &armv8_event_attr_br_immed_retired.attr.attr,
- &armv8_event_attr_br_return_retired.attr.attr,
- &armv8_event_attr_unaligned_ldst_retired.attr.attr,
- &armv8_event_attr_br_mis_pred.attr.attr,
- &armv8_event_attr_cpu_cycles.attr.attr,
- &armv8_event_attr_br_pred.attr.attr,
- &armv8_event_attr_mem_access.attr.attr,
- &armv8_event_attr_l1i_cache.attr.attr,
- &armv8_event_attr_l1d_cache_wb.attr.attr,
- &armv8_event_attr_l2d_cache.attr.attr,
- &armv8_event_attr_l2d_cache_refill.attr.attr,
- &armv8_event_attr_l2d_cache_wb.attr.attr,
- &armv8_event_attr_bus_access.attr.attr,
- &armv8_event_attr_memory_error.attr.attr,
- &armv8_event_attr_inst_spec.attr.attr,
- &armv8_event_attr_ttbr_write_retired.attr.attr,
- &armv8_event_attr_bus_cycles.attr.attr,
- &armv8_event_attr_l1d_cache_allocate.attr.attr,
- &armv8_event_attr_l2d_cache_allocate.attr.attr,
- &armv8_event_attr_br_retired.attr.attr,
- &armv8_event_attr_br_mis_pred_retired.attr.attr,
- &armv8_event_attr_stall_frontend.attr.attr,
- &armv8_event_attr_stall_backend.attr.attr,
- &armv8_event_attr_l1d_tlb.attr.attr,
- &armv8_event_attr_l1i_tlb.attr.attr,
- &armv8_event_attr_l2i_cache.attr.attr,
- &armv8_event_attr_l2i_cache_refill.attr.attr,
- &armv8_event_attr_l3d_cache_allocate.attr.attr,
- &armv8_event_attr_l3d_cache_refill.attr.attr,
- &armv8_event_attr_l3d_cache.attr.attr,
- &armv8_event_attr_l3d_cache_wb.attr.attr,
- &armv8_event_attr_l2d_tlb_refill.attr.attr,
- &armv8_event_attr_l2i_tlb_refill.attr.attr,
- &armv8_event_attr_l2d_tlb.attr.attr,
- &armv8_event_attr_l2i_tlb.attr.attr,
- &armv8_event_attr_remote_access.attr.attr,
- &armv8_event_attr_ll_cache.attr.attr,
- &armv8_event_attr_ll_cache_miss.attr.attr,
- &armv8_event_attr_dtlb_walk.attr.attr,
- &armv8_event_attr_itlb_walk.attr.attr,
- &armv8_event_attr_ll_cache_rd.attr.attr,
- &armv8_event_attr_ll_cache_miss_rd.attr.attr,
- &armv8_event_attr_remote_access_rd.attr.attr,
- &armv8_event_attr_sample_pop.attr.attr,
- &armv8_event_attr_sample_feed.attr.attr,
- &armv8_event_attr_sample_filtrate.attr.attr,
- &armv8_event_attr_sample_collision.attr.attr,
+ ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR),
+ ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL),
+ ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL),
+ ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL),
+ ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE),
+ ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL),
+ ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED),
+ ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED),
+ ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED),
+ ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN),
+ ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN),
+ ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED),
+ ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED),
+ ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED),
+ ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED),
+ ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED),
+ ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED),
+ ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES),
+ ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED),
+ ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS),
+ ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE),
+ ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB),
+ ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE),
+ ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL),
+ ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB),
+ ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS),
+ ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR),
+ ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC),
+ ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED),
+ ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES),
+ /* Don't expose the chain event in /sys, since it's useless in isolation */
+ ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE),
+ ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE),
+ ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED),
+ ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED),
+ ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND),
+ ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND),
+ ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB),
+ ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB),
+ ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE),
+ ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL),
+ ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE),
+ ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL),
+ ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE),
+ ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB),
+ ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL),
+ ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL),
+ ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB),
+ ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB),
+ ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS),
+ ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE),
+ ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS),
+ ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK),
+ ARMV8_EVENT_ATTR(itlb_walk, ARMV8_PMUV3_PERFCTR_ITLB_WALK),
+ ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD),
+ ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD),
+ ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD),
+ ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP),
+ ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED),
+ ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE),
+ ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION),
NULL,
};
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index c4452827419b..d1c95dcf1d78 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -455,10 +455,6 @@ int __init arch_populate_kprobe_blacklist(void)
(unsigned long)__irqentry_text_end);
if (ret)
return ret;
- ret = kprobe_add_area_blacklist((unsigned long)__exception_text_start,
- (unsigned long)__exception_text_end);
- if (ret)
- return ret;
ret = kprobe_add_area_blacklist((unsigned long)__idmap_text_start,
(unsigned long)__idmap_text_end);
if (ret)
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index c9f72b2665f1..43ae4e0c968f 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -81,7 +81,8 @@ static void cpu_psci_cpu_die(unsigned int cpu)
static int cpu_psci_cpu_kill(unsigned int cpu)
{
- int err, i;
+ int err;
+ unsigned long start, end;
if (!psci_ops.affinity_info)
return 0;
@@ -91,16 +92,18 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
* while it is dying. So, try again a few times.
*/
- for (i = 0; i < 10; i++) {
+ start = jiffies;
+ end = start + msecs_to_jiffies(100);
+ do {
err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {
- pr_info("CPU%d killed.\n", cpu);
+ pr_info("CPU%d killed (polled %d ms)\n", cpu,
+ jiffies_to_msecs(jiffies - start));
return 0;
}
- msleep(10);
- pr_info("Retrying again to check for CPU kill\n");
- }
+ usleep_range(100, 1000);
+ } while (time_before(jiffies, end));
pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",
cpu, err);
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index ea94cf8f9dc6..d6259dac62b6 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -2,6 +2,7 @@
// Copyright (C) 2017 Arm Ltd.
#define pr_fmt(fmt) "sdei: " fmt
+#include <linux/arm-smccc.h>
#include <linux/arm_sdei.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
@@ -161,7 +162,7 @@ unsigned long sdei_arch_get_entry_point(int conduit)
return 0;
}
- sdei_exit_mode = (conduit == CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC;
+ sdei_exit_mode = (conduit == SMCCC_CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC;
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
if (arm64_kernel_unmapped_at_el0()) {
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index dc9fe879c279..ab149bcc3dc7 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -345,8 +345,7 @@ void __cpu_die(unsigned int cpu)
*/
err = op_cpu_kill(cpu);
if (err)
- pr_warn("CPU%d may not have shut down cleanly: %d\n",
- cpu, err);
+ pr_warn("CPU%d may not have shut down cleanly: %d\n", cpu, err);
}
/*
@@ -976,8 +975,8 @@ void smp_send_stop(void)
udelay(1);
if (num_online_cpus() > 1)
- pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
- cpumask_pr_args(cpu_online_mask));
+ pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
+ cpumask_pr_args(cpu_online_mask));
sdei_mask_local_cpu();
}
@@ -1017,8 +1016,8 @@ void crash_smp_send_stop(void)
udelay(1);
if (atomic_read(&waiting_for_crash_ipi) > 0)
- pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
- cpumask_pr_args(&mask));
+ pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
+ cpumask_pr_args(&mask));
sdei_mask_local_cpu();
}
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index f1cb64959427..3c18c2454089 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -8,6 +8,7 @@
*/
#include <linux/compat.h>
+#include <linux/cpufeature.h>
#include <linux/personality.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
@@ -17,6 +18,7 @@
#include <asm/cacheflush.h>
#include <asm/system_misc.h>
+#include <asm/tlbflush.h>
#include <asm/unistd.h>
static long
@@ -30,6 +32,15 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
if (fatal_signal_pending(current))
return 0;
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) {
+ /*
+ * The workaround requires an inner-shareable tlbi.
+ * We pick the reserved-ASID to minimise the impact.
+ */
+ __tlbi(aside1is, __TLBI_VADDR(0, 0));
+ dsb(ish);
+ }
+
ret = __flush_cache_user_range(start, start + chunk);
if (ret)
return ret;
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index 871c739f060a..9a9d98a443fc 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -154,14 +154,14 @@ static inline void sve_user_discard(void)
sve_user_disable();
}
-asmlinkage void el0_svc_handler(struct pt_regs *regs)
+void el0_svc_handler(struct pt_regs *regs)
{
sve_user_discard();
el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table);
}
#ifdef CONFIG_COMPAT
-asmlinkage void el0_svc_compat_handler(struct pt_regs *regs)
+void el0_svc_compat_handler(struct pt_regs *regs)
{
el0_svc_common(regs, regs->regs[7], __NR_compat_syscalls,
compat_sys_call_table);
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index 0b2946414dc9..73f06d4b3aae 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -30,6 +30,7 @@
#include <asm/thread_info.h>
#include <asm/stacktrace.h>
+#include <asm/paravirt.h>
unsigned long profile_pc(struct pt_regs *regs)
{
@@ -65,4 +66,6 @@ void __init time_init(void)
/* Calibrate the delay loop directly */
lpj_fine = arch_timer_rate / HZ;
+
+ pv_time_init();
}
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 34739e80211b..73caf35c2262 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -35,6 +35,7 @@
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/insn.h>
+#include <asm/kprobes.h>
#include <asm/traps.h>
#include <asm/smp.h>
#include <asm/stack_pointer.h>
@@ -393,7 +394,7 @@ void arm64_notify_segfault(unsigned long addr)
force_signal_inject(SIGSEGV, code, addr);
}
-asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
+void do_undefinstr(struct pt_regs *regs)
{
/* check for AArch32 breakpoint instructions */
if (!aarch32_break_handler(regs))
@@ -405,6 +406,7 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
BUG_ON(!user_mode(regs));
force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
}
+NOKPROBE_SYMBOL(do_undefinstr);
#define __user_cache_maint(insn, address, res) \
if (address >= user_addr_max()) { \
@@ -470,6 +472,15 @@ static void ctr_read_handler(unsigned int esr, struct pt_regs *regs)
int rt = ESR_ELx_SYS64_ISS_RT(esr);
unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0);
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) {
+ /* Hide DIC so that we can trap the unnecessary maintenance...*/
+ val &= ~BIT(CTR_DIC_SHIFT);
+
+ /* ... and fake IminLine to reduce the number of traps. */
+ val &= ~CTR_IMINLINE_MASK;
+ val |= (PAGE_SHIFT - 2) & CTR_IMINLINE_MASK;
+ }
+
pt_regs_write_reg(regs, rt, val);
arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
@@ -667,7 +678,7 @@ static const struct sys64_hook cp15_64_hooks[] = {
{},
};
-asmlinkage void __exception do_cp15instr(unsigned int esr, struct pt_regs *regs)
+void do_cp15instr(unsigned int esr, struct pt_regs *regs)
{
const struct sys64_hook *hook, *hook_base;
@@ -705,9 +716,10 @@ asmlinkage void __exception do_cp15instr(unsigned int esr, struct pt_regs *regs)
*/
do_undefinstr(regs);
}
+NOKPROBE_SYMBOL(do_cp15instr);
#endif
-asmlinkage void __exception do_sysinstr(unsigned int esr, struct pt_regs *regs)
+void do_sysinstr(unsigned int esr, struct pt_regs *regs)
{
const struct sys64_hook *hook;
@@ -724,6 +736,7 @@ asmlinkage void __exception do_sysinstr(unsigned int esr, struct pt_regs *regs)
*/
do_undefinstr(regs);
}
+NOKPROBE_SYMBOL(do_sysinstr);
static const char *esr_class_str[] = {
[0 ... ESR_ELx_EC_MAX] = "UNRECOGNIZED EC",
@@ -793,7 +806,7 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
* bad_el0_sync handles unexpected, but potentially recoverable synchronous
* exceptions taken from EL0. Unlike bad_mode, this returns.
*/
-asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
+void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
{
void __user *pc = (void __user *)instruction_pointer(regs);
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index aa76f7259668..841a8b4ce114 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -5,6 +5,8 @@
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
*/
+#define RO_EXCEPTION_TABLE_ALIGN 8
+
#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/kernel-pgtable.h>
@@ -111,9 +113,6 @@ SECTIONS
}
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
- __exception_text_start = .;
- *(.exception.text)
- __exception_text_end = .;
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
ENTRY_TEXT
@@ -135,11 +134,9 @@ SECTIONS
. = ALIGN(SEGMENT_ALIGN);
_etext = .; /* End of text section */
- RO_DATA(PAGE_SIZE) /* everything from this point to */
- EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */
- NOTES
+ /* everything from this point to __init_begin will be marked RO NX */
+ RO_DATA(PAGE_SIZE)
- . = ALIGN(PAGE_SIZE);
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;
@@ -215,7 +212,7 @@ SECTIONS
_data = .;
_sdata = .;
- RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
+ RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
/*
* Data written with the MMU off but read with the MMU on requires
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a67121d419a2..a475c68cbfec 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -21,6 +21,8 @@ if VIRTUALIZATION
config KVM
bool "Kernel-based Virtual Machine (KVM) support"
depends on OF
+ # for TASKSTATS/TASK_DELAY_ACCT:
+ depends on NET && MULTIUSER
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
@@ -39,6 +41,8 @@ config KVM
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_VCPU_RUN_PID_CHANGE
+ select TASKSTATS
+ select TASK_DELAY_ACCT
---help---
Support hosting virtualized guest machines.
We don't support KVM with 16K page tables yet, due to the multiple
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 3ac1a64d2fb9..5ffbdc39e780 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,6 +13,8 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hypercalls.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/pvtime.o
kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index dfd626447482..2fff06114a8f 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -34,6 +34,10 @@
#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU }
struct kvm_stats_debugfs_item debugfs_entries[] = {
+ VCPU_STAT(halt_successful_poll),
+ VCPU_STAT(halt_attempted_poll),
+ VCPU_STAT(halt_poll_invalid),
+ VCPU_STAT(halt_wakeup),
VCPU_STAT(hvc_exit_stat),
VCPU_STAT(wfe_exit_stat),
VCPU_STAT(wfi_exit_stat),
@@ -712,6 +716,12 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
if (events->exception.serror_pending && events->exception.serror_has_esr)
events->exception.serror_esr = vcpu_get_vsesr(vcpu);
+ /*
+ * We never return a pending ext_dabt here because we deliver it to
+ * the virtual CPU directly when setting the event and it's no longer
+ * 'pending' at this point.
+ */
+
return 0;
}
@@ -720,6 +730,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
{
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
+ bool ext_dabt_pending = events->exception.ext_dabt_pending;
if (serror_pending && has_esr) {
if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
@@ -733,6 +744,9 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
kvm_inject_vabt(vcpu);
}
+ if (ext_dabt_pending)
+ kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+
return 0;
}
@@ -858,6 +872,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_set_attr(vcpu, attr);
break;
+ case KVM_ARM_VCPU_PVTIME_CTRL:
+ ret = kvm_arm_pvtime_set_attr(vcpu, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -878,6 +895,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_get_attr(vcpu, attr);
break;
+ case KVM_ARM_VCPU_PVTIME_CTRL:
+ ret = kvm_arm_pvtime_get_attr(vcpu, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -898,6 +918,9 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_has_attr(vcpu, attr);
break;
+ case KVM_ARM_VCPU_PVTIME_CTRL:
+ ret = kvm_arm_pvtime_has_attr(vcpu, attr);
+ break;
default:
ret = -ENXIO;
break;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 706cca23f0d2..aacfc55de44c 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -11,8 +11,6 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
-#include <kvm/arm_psci.h>
-
#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/kvm_asm.h>
@@ -22,6 +20,8 @@
#include <asm/debug-monitors.h>
#include <asm/traps.h>
+#include <kvm/arm_hypercalls.h>
+
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 799e84a40335..72fbbd86eb5e 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -12,7 +12,7 @@
#include <kvm/arm_psci.h>
-#include <asm/arch_gicv3.h>
+#include <asm/barrier.h>
#include <asm/cpufeature.h>
#include <asm/kprobes.h>
#include <asm/kvm_asm.h>
@@ -118,6 +118,20 @@ static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
}
write_sysreg(val, cptr_el2);
+
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
+ struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+
+ isb();
+ /*
+ * At this stage, and thanks to the above isb(), S2 is
+ * configured and enabled. We can now restore the guest's S1
+ * configuration: SCTLR, and only then TCR.
+ */
+ write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR);
+ isb();
+ write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR);
+ }
}
static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
@@ -159,6 +173,23 @@ static void __hyp_text __deactivate_traps_nvhe(void)
{
u64 mdcr_el2 = read_sysreg(mdcr_el2);
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
+ u64 val;
+
+ /*
+ * Set the TCR and SCTLR registers in the exact opposite
+ * sequence as __activate_traps_nvhe (first prevent walks,
+ * then force the MMU on). A generous sprinkling of isb()
+ * ensure that things happen in this exact order.
+ */
+ val = read_sysreg_el1(SYS_TCR);
+ write_sysreg_el1(val | TCR_EPD1_MASK | TCR_EPD0_MASK, SYS_TCR);
+ isb();
+ val = read_sysreg_el1(SYS_SCTLR);
+ write_sysreg_el1(val | SCTLR_ELx_M, SYS_SCTLR);
+ isb();
+ }
+
__deactivate_traps_common();
mdcr_el2 &= MDCR_EL2_HPMN_MASK;
@@ -657,7 +688,7 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
*/
if (system_uses_irq_prio_masking()) {
gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
- dsb(sy);
+ pmr_sync();
}
vcpu = kern_hyp_va(vcpu);
@@ -670,18 +701,23 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
__sysreg_save_state_nvhe(host_ctxt);
- __activate_vm(kern_hyp_va(vcpu->kvm));
- __activate_traps(vcpu);
-
- __hyp_vgic_restore_state(vcpu);
- __timer_enable_traps(vcpu);
-
/*
* We must restore the 32-bit state before the sysregs, thanks
* to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
+ *
+ * Also, and in order to be able to deal with erratum #1319537 (A57)
+ * and #1319367 (A72), we must ensure that all VM-related sysreg are
+ * restored before we enable S2 translation.
*/
__sysreg32_restore_state(vcpu);
__sysreg_restore_state_nvhe(guest_ctxt);
+
+ __activate_vm(kern_hyp_va(vcpu->kvm));
+ __activate_traps(vcpu);
+
+ __hyp_vgic_restore_state(vcpu);
+ __timer_enable_traps(vcpu);
+
__debug_switch_to_guest(vcpu);
__set_guest_arch_workaround_state(vcpu);
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 7ddbc849b580..22b8128d19f6 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -117,12 +117,26 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
{
write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2);
write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1);
- write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR);
+
+ if (!cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
+ write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR);
+ write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR);
+ } else if (!ctxt->__hyp_running_vcpu) {
+ /*
+ * Must only be done for guest registers, hence the context
+ * test. We're coming from the host, so SCTLR.M is already
+ * set. Pairs with __activate_traps_nvhe().
+ */
+ write_sysreg_el1((ctxt->sys_regs[TCR_EL1] |
+ TCR_EPD1_MASK | TCR_EPD0_MASK),
+ SYS_TCR);
+ isb();
+ }
+
write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1);
write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], SYS_CPACR);
write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1], SYS_TTBR0);
write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1], SYS_TTBR1);
- write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR);
write_sysreg_el1(ctxt->sys_regs[ESR_EL1], SYS_ESR);
write_sysreg_el1(ctxt->sys_regs[AFSR0_EL1], SYS_AFSR0);
write_sysreg_el1(ctxt->sys_regs[AFSR1_EL1], SYS_AFSR1);
@@ -135,6 +149,23 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1);
write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1);
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1319367) &&
+ ctxt->__hyp_running_vcpu) {
+ /*
+ * Must only be done for host registers, hence the context
+ * test. Pairs with __deactivate_traps_nvhe().
+ */
+ isb();
+ /*
+ * At this stage, and thanks to the above isb(), S2 is
+ * deconfigured and disabled. We can now restore the host's
+ * S1 configuration: SCTLR, and only then TCR.
+ */
+ write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR);
+ isb();
+ write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR);
+ }
+
write_sysreg(ctxt->gp_regs.sp_el1, sp_el1);
write_sysreg_el1(ctxt->gp_regs.elr_el1, SYS_ELR);
write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],SYS_SPSR);
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index eb0efc5557f3..c2bc17ca6430 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -63,6 +63,22 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm,
static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm,
struct tlb_inv_context *cxt)
{
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
+ u64 val;
+
+ /*
+ * For CPUs that are affected by ARM 1319367, we need to
+ * avoid a host Stage-1 walk while we have the guest's
+ * VMID set in the VTTBR in order to invalidate TLBs.
+ * We're guaranteed that the S1 MMU is enabled, so we can
+ * simply set the EPD bits to avoid any further TLB fill.
+ */
+ val = cxt->tcr = read_sysreg_el1(SYS_TCR);
+ val |= TCR_EPD1_MASK | TCR_EPD0_MASK;
+ write_sysreg_el1(val, SYS_TCR);
+ isb();
+ }
+
__load_guest_stage2(kvm);
isb();
}
@@ -100,6 +116,13 @@ static void __hyp_text __tlb_switch_to_host_nvhe(struct kvm *kvm,
struct tlb_inv_context *cxt)
{
write_sysreg(0, vttbr_el2);
+
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
+ /* Ensure write of the host VMID */
+ isb();
+ /* Restore the host's TCR_EL1 */
+ write_sysreg_el1(cxt->tcr, SYS_TCR);
+ }
}
static void __hyp_text __tlb_switch_to_host(struct kvm *kvm,
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index a9d25a305af5..ccdb6a051ab2 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -109,7 +109,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
/**
* kvm_inject_dabt - inject a data abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
+ * @vcpu: The VCPU to receive the data abort
* @addr: The address to report in the DFAR
*
* It is assumed that this code is called from the VCPU thread and that the
@@ -125,7 +125,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
/**
* kvm_inject_pabt - inject a prefetch abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
+ * @vcpu: The VCPU to receive the prefetch abort
* @addr: The address to report in the DFAR
*
* It is assumed that this code is called from the VCPU thread and that the
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index 10415572e82f..aeafc03e961a 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -20,7 +20,6 @@
* Alignment fixed up by hardware.
*/
ENTRY(__arch_clear_user)
- uaccess_enable_not_uao x2, x3, x4
mov x2, x1 // save the size for fixup return
subs x1, x1, #8
b.mi 2f
@@ -40,7 +39,6 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
b.mi 5f
uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
5: mov x0, #0
- uaccess_disable_not_uao x2, x3
ret
ENDPROC(__arch_clear_user)
EXPORT_SYMBOL(__arch_clear_user)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 680e74409ff9..ebb3c06cbb5d 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -54,10 +54,8 @@
end .req x5
ENTRY(__arch_copy_from_user)
- uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
- uaccess_disable_not_uao x3, x4
mov x0, #0 // Nothing to copy
ret
ENDPROC(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 0bedae3f3792..3d8153a1ebce 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -56,10 +56,8 @@
end .req x5
ENTRY(__arch_copy_in_user)
- uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
- uaccess_disable_not_uao x3, x4
mov x0, #0
ret
ENDPROC(__arch_copy_in_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 2d88c736e8f2..357eae2c18eb 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -53,10 +53,8 @@
end .req x5
ENTRY(__arch_copy_to_user)
- uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
- uaccess_disable_not_uao x3, x4
mov x0, #0
ret
ENDPROC(__arch_copy_to_user)
diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c
index cbfcbe6470a5..bfa30b75b2b8 100644
--- a/arch/arm64/lib/uaccess_flushcache.c
+++ b/arch/arm64/lib/uaccess_flushcache.c
@@ -28,7 +28,11 @@ void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
unsigned long __copy_user_flushcache(void *to, const void __user *from,
unsigned long n)
{
- unsigned long rc = __arch_copy_from_user(to, from, n);
+ unsigned long rc;
+
+ uaccess_enable_not_uao();
+ rc = __arch_copy_from_user(to, from, n);
+ uaccess_disable_not_uao();
/* See above */
__clean_dcache_area_pop(to, n - rc);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9fc6db0bcbad..077b02a2d4d3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -32,7 +32,8 @@
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
-#include <asm/kasan.h>
+#include <asm/kprobes.h>
+#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/pgtable.h>
@@ -101,18 +102,6 @@ static void mem_abort_decode(unsigned int esr)
data_abort_decode(esr);
}
-static inline bool is_ttbr0_addr(unsigned long addr)
-{
- /* entry assembly clears tags for TTBR0 addrs */
- return addr < TASK_SIZE;
-}
-
-static inline bool is_ttbr1_addr(unsigned long addr)
-{
- /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
- return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
-}
-
static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
/* Either init_pg_dir or swapper_pg_dir */
@@ -318,6 +307,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
if (is_el1_permission_fault(addr, esr, regs)) {
if (esr & ESR_ELx_WNR)
msg = "write to read-only memory";
+ else if (is_el1_instruction_abort(esr))
+ msg = "execute from non-executable memory";
else
msg = "read from unreadable memory";
} else if (addr < PAGE_SIZE) {
@@ -736,8 +727,7 @@ static const struct fault_info fault_info[] = {
{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
};
-asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
- struct pt_regs *regs)
+void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_fault_info(esr);
@@ -753,43 +743,21 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
arm64_notify_die(inf->name, regs,
inf->sig, inf->code, (void __user *)addr, esr);
}
+NOKPROBE_SYMBOL(do_mem_abort);
-asmlinkage void __exception do_el0_irq_bp_hardening(void)
+void do_el0_irq_bp_hardening(void)
{
/* PC has already been checked in entry.S */
arm64_apply_bp_hardening();
}
+NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);
-asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
- unsigned int esr,
- struct pt_regs *regs)
-{
- /*
- * We've taken an instruction abort from userspace and not yet
- * re-enabled IRQs. If the address is a kernel address, apply
- * BP hardening prior to enabling IRQs and pre-emption.
- */
- if (!is_ttbr0_addr(addr))
- arm64_apply_bp_hardening();
-
- local_daif_restore(DAIF_PROCCTX);
- do_mem_abort(addr, esr, regs);
-}
-
-
-asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
- unsigned int esr,
- struct pt_regs *regs)
+void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
- if (user_mode(regs)) {
- if (!is_ttbr0_addr(instruction_pointer(regs)))
- arm64_apply_bp_hardening();
- local_daif_restore(DAIF_PROCCTX);
- }
-
arm64_notify_die("SP/PC alignment exception", regs,
SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
}
+NOKPROBE_SYMBOL(do_sp_pc_abort);
int __init early_brk64(unsigned long addr, unsigned int esr,
struct pt_regs *regs);
@@ -872,8 +840,7 @@ NOKPROBE_SYMBOL(debug_exception_exit);
#ifdef CONFIG_ARM64_ERRATUM_1463225
DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
{
if (user_mode(regs))
return 0;
@@ -892,16 +859,15 @@ cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
return 1;
}
#else
-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
{
return 0;
}
#endif /* CONFIG_ARM64_ERRATUM_1463225 */
+NOKPROBE_SYMBOL(cortex_a76_erratum_1463225_debug_handler);
-asmlinkage void __exception do_debug_exception(unsigned long addr_if_watchpoint,
- unsigned int esr,
- struct pt_regs *regs)
+void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
+ struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_debug_fault_info(esr);
unsigned long pc = instruction_pointer(regs);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 45c00a54909c..be9481cdf3b9 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -20,6 +20,7 @@
#include <linux/sort.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
+#include <linux/dma-direct.h>
#include <linux/dma-mapping.h>
#include <linux/dma-contiguous.h>
#include <linux/efi.h>
@@ -41,6 +42,8 @@
#include <asm/tlb.h>
#include <asm/alternative.h>
+#define ARM64_ZONE_DMA_BITS 30
+
/*
* We need to be able to catch inadvertent references to memstart_addr
* that occur (potentially in generic code) before arm64_memblock_init()
@@ -56,7 +59,14 @@ EXPORT_SYMBOL(physvirt_offset);
struct page *vmemmap __ro_after_init;
EXPORT_SYMBOL(vmemmap);
+/*
+ * We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of
+ * memory as some devices, namely the Raspberry Pi 4, have peripherals with
+ * this limited view of the memory. ZONE_DMA32 will cover the rest of the 32
+ * bit addressable memory area.
+ */
phys_addr_t arm64_dma_phys_limit __ro_after_init;
+static phys_addr_t arm64_dma32_phys_limit __ro_after_init;
#ifdef CONFIG_KEXEC_CORE
/*
@@ -81,7 +91,7 @@ static void __init reserve_crashkernel(void)
if (crash_base == 0) {
/* Current arm64 boot protocol requires 2MB alignment */
- crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
+ crash_base = memblock_find_in_range(0, arm64_dma32_phys_limit,
crash_size, SZ_2M);
if (crash_base == 0) {
pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
@@ -169,15 +179,16 @@ static void __init reserve_elfcorehdr(void)
{
}
#endif /* CONFIG_CRASH_DUMP */
+
/*
- * Return the maximum physical address for ZONE_DMA32 (DMA_BIT_MASK(32)). It
- * currently assumes that for memory starting above 4G, 32-bit devices will
- * use a DMA offset.
+ * Return the maximum physical address for a zone with a given address size
+ * limit. It currently assumes that for memory starting above 4G, 32-bit
+ * devices will use a DMA offset.
*/
-static phys_addr_t __init max_zone_dma_phys(void)
+static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
{
- phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32);
- return min(offset + (1ULL << 32), memblock_end_of_DRAM());
+ phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits);
+ return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
}
#ifdef CONFIG_NUMA
@@ -186,8 +197,11 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
+#endif
#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = PFN_DOWN(max_zone_dma_phys());
+ max_zone_pfns[ZONE_DMA32] = PFN_DOWN(arm64_dma32_phys_limit);
#endif
max_zone_pfns[ZONE_NORMAL] = max;
@@ -200,16 +214,21 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
struct memblock_region *reg;
unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
- unsigned long max_dma = min;
+ unsigned long max_dma32 = min;
+ unsigned long __maybe_unused max_dma = min;
memset(zone_size, 0, sizeof(zone_size));
- /* 4GB maximum for 32-bit only capable devices */
-#ifdef CONFIG_ZONE_DMA32
+#ifdef CONFIG_ZONE_DMA
max_dma = PFN_DOWN(arm64_dma_phys_limit);
- zone_size[ZONE_DMA32] = max_dma - min;
+ zone_size[ZONE_DMA] = max_dma - min;
+ max_dma32 = max_dma;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
+ zone_size[ZONE_DMA32] = max_dma32 - max_dma;
#endif
- zone_size[ZONE_NORMAL] = max - max_dma;
+ zone_size[ZONE_NORMAL] = max - max_dma32;
memcpy(zhole_size, zone_size, sizeof(zhole_size));
@@ -219,16 +238,22 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
if (start >= max)
continue;
-
-#ifdef CONFIG_ZONE_DMA32
+#ifdef CONFIG_ZONE_DMA
if (start < max_dma) {
- unsigned long dma_end = min(end, max_dma);
- zhole_size[ZONE_DMA32] -= dma_end - start;
+ unsigned long dma_end = min_not_zero(end, max_dma);
+ zhole_size[ZONE_DMA] -= dma_end - start;
}
#endif
- if (end > max_dma) {
+#ifdef CONFIG_ZONE_DMA32
+ if (start < max_dma32) {
+ unsigned long dma32_end = min(end, max_dma32);
+ unsigned long dma32_start = max(start, max_dma);
+ zhole_size[ZONE_DMA32] -= dma32_end - dma32_start;
+ }
+#endif
+ if (end > max_dma32) {
unsigned long normal_end = min(end, max);
- unsigned long normal_start = max(start, max_dma);
+ unsigned long normal_start = max(start, max_dma32);
zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
}
}
@@ -418,11 +443,15 @@ void __init arm64_memblock_init(void)
early_init_fdt_scan_reserved_mem();
- /* 4GB maximum for 32-bit only capable devices */
+ if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ zone_dma_bits = ARM64_ZONE_DMA_BITS;
+ arm64_dma_phys_limit = max_zone_phys(ARM64_ZONE_DMA_BITS);
+ }
+
if (IS_ENABLED(CONFIG_ZONE_DMA32))
- arm64_dma_phys_limit = max_zone_dma_phys();
+ arm64_dma32_phys_limit = max_zone_phys(32);
else
- arm64_dma_phys_limit = PHYS_MASK + 1;
+ arm64_dma32_phys_limit = PHYS_MASK + 1;
reserve_crashkernel();
@@ -430,7 +459,7 @@ void __init arm64_memblock_init(void)
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
- dma_contiguous_reserve(arm64_dma_phys_limit);
+ dma_contiguous_reserve(arm64_dma32_phys_limit);
}
void __init bootmem_init(void)
@@ -534,7 +563,7 @@ static void __init free_unused_memmap(void)
void __init mem_init(void)
{
if (swiotlb_force == SWIOTLB_FORCE ||
- max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
+ max_pfn > PFN_DOWN(arm64_dma_phys_limit ? : arm64_dma32_phys_limit))
swiotlb_init(1);
else
swiotlb_force = SWIOTLB_NO_FORCE;
@@ -571,7 +600,7 @@ void free_initmem(void)
{
free_reserved_area(lm_alias(__init_begin),
lm_alias(__init_end),
- 0, "unused kernel");
+ POISON_FREE_INITMEM, "unused kernel");
/*
* Unmap the __init region but leave the VM area in place. This
* prevents the region from being reused for kernel modules, which
@@ -580,18 +609,6 @@ void free_initmem(void)
unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- unsigned long aligned_start, aligned_end;
-
- aligned_start = __virt_to_phys(start) & PAGE_MASK;
- aligned_end = PAGE_ALIGN(__virt_to_phys(end));
- memblock_free(aligned_start, aligned_end - aligned_start);
- free_reserved_area((void *)start, (void *)end, 0, "initrd");
-}
-#endif
-
/*
* Dump out memory limit information on panic.
*/
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 60c929f3683b..5a3b15a14a7f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -338,7 +338,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
phys_addr_t (*pgtable_alloc)(int),
int flags)
{
- unsigned long addr, length, end, next;
+ unsigned long addr, end, next;
pgd_t *pgdp = pgd_offset_raw(pgdir, virt);
/*
@@ -350,9 +350,8 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
phys &= PAGE_MASK;
addr = virt & PAGE_MASK;
- length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
+ end = PAGE_ALIGN(virt + size);
- end = addr + length;
do {
next = pgd_addr_end(addr, end);
alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
@@ -1061,6 +1060,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
+ memblock_clear_nomap(start, size);
+
return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
restrictions);
}