summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig11
-rw-r--r--arch/x86/Kconfig.debug3
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/kaslr_64.c5
-rw-r--r--arch/x86/crypto/Makefile25
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S749
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c407
-rw-r--r--arch/x86/crypto/aegis128l-aesni-asm.S825
-rw-r--r--arch/x86/crypto/aegis128l-aesni-glue.c407
-rw-r--r--arch/x86/crypto/aegis256-aesni-asm.S702
-rw-r--r--arch/x86/crypto/aegis256-aesni-glue.c407
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c2
-rw-r--r--arch/x86/crypto/morus1280-avx2-asm.S621
-rw-r--r--arch/x86/crypto/morus1280-avx2-glue.c68
-rw-r--r--arch/x86/crypto/morus1280-sse2-asm.S895
-rw-r--r--arch/x86/crypto/morus1280-sse2-glue.c68
-rw-r--r--arch/x86/crypto/morus1280_glue.c302
-rw-r--r--arch/x86/crypto/morus640-sse2-asm.S614
-rw-r--r--arch/x86/crypto/morus640-sse2-glue.c68
-rw-r--r--arch/x86/crypto/morus640_glue.c298
-rw-r--r--arch/x86/crypto/salsa20-i586-asm_32.S938
-rw-r--r--arch/x86/crypto/salsa20-x86_64-asm_64.S805
-rw-r--r--arch/x86/crypto/salsa20_glue.c91
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/mcsafe_test.h75
-rw-r--r--arch/x86/include/asm/mmu_context.h15
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/nospec-branch.h41
-rw-r--r--arch/x86/include/asm/page_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/pkeys.h13
-rw-r--r--arch/x86/include/asm/trace/irq_vectors.h2
-rw-r--r--arch/x86/include/asm/x86_init.h1
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/vector.c45
-rw-r--r--arch/x86/kernel/cpu/bugs.c13
-rw-r--r--arch/x86/kernel/cpu/common.c9
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c33
-rw-r--r--arch/x86/kernel/i8237.c25
-rw-r--r--arch/x86/kernel/idt.c7
-rw-r--r--arch/x86/kernel/platform-quirks.c7
-rw-r--r--arch/x86/kernel/setup.c8
-rw-r--r--arch/x86/kvm/cpuid.c10
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/lib/memcpy_64.S10
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c3
-rw-r--r--arch/x86/mm/pgtable.c10
-rw-r--r--arch/x86/net/Makefile7
-rw-r--r--arch/x86/net/bpf_jit.S154
-rw-r--r--arch/x86/net/bpf_jit_comp.c144
-rw-r--r--arch/x86/net/bpf_jit_comp32.c2419
-rw-r--r--arch/x86/pci/early.c19
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/platform/uv/tlb_uv.c2
-rw-r--r--arch/x86/platform/uv/uv_irq.c7
-rw-r--r--arch/x86/um/Kconfig10
-rw-r--r--arch/x86/xen/mmu.c60
-rw-r--r--arch/x86/xen/xen-pvh.S47
62 files changed, 9260 insertions, 2292 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c320ce005b02..297789aef9fa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
# Select 32 or 64 bit
config 64BIT
- bool "64-bit kernel" if ARCH = "x86"
- default ARCH != "i386"
+ bool "64-bit kernel" if "$(ARCH)" = "x86"
+ default "$(ARCH)" != "i386"
---help---
Say yes to build a 64-bit kernel - formerly known as x86_64
Say no to build a 32-bit kernel - formerly known as i386
@@ -60,6 +60,7 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_UACCESS_MCSAFE if X86_64
@@ -140,7 +141,7 @@ config X86
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
- select HAVE_EBPF_JIT if X86_64
+ select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EXIT_THREAD
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
@@ -334,6 +335,9 @@ config ARCH_SUPPORTS_UPROBES
config FIX_EARLYCON_MEM
def_bool y
+config DYNAMIC_PHYSICAL_MASK
+ bool
+
config PGTABLE_LEVELS
int
default 5 if X86_5LEVEL
@@ -1486,6 +1490,7 @@ config ARCH_HAS_MEM_ENCRYPT
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
+ select DYNAMIC_PHYSICAL_MASK
---help---
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 192e4d2f9efc..c6dd1d980081 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,6 +72,9 @@ config EARLY_PRINTK_USB_XDBC
You should normally say N here, unless you want to debug early
crashes or need a very simple printk logging facility.
+config MCSAFE_TEST
+ def_bool n
+
config X86_PTDUMP_CORE
def_bool n
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 60135cbd905c..f0a6ea22429d 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -94,7 +94,7 @@ ifeq ($(CONFIG_X86_32),y)
else
BITS := 64
UTS_MACHINE := x86_64
- CHECKFLAGS += -D__x86_64__ -m64
+ CHECKFLAGS += -D__x86_64__
biarch := -m64
KBUILD_AFLAGS += -m64
diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
index 522d11431433..748456c365f4 100644
--- a/arch/x86/boot/compressed/kaslr_64.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -69,6 +69,8 @@ static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */
static unsigned long top_level_pgt;
+phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+
/*
* Mapping information structure passed to kernel_ident_mapping_init().
* Due to relocation, pointers must be assigned at run time not build time.
@@ -81,6 +83,9 @@ void initialize_identity_maps(void)
/* If running as an SEV guest, the encryption mask is required. */
set_sev_encryption_mask();
+ /* Exclude the encryption mask from __PHYSICAL_MASK */
+ physical_mask &= ~sme_me_mask;
+
/* Init mapping_info with run-time function/buffer pointers. */
mapping_info.alloc_pgt_page = alloc_pgt_page;
mapping_info.context = &pgt_data;
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5f07333bb224..a450ad573dcb 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
-obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@@ -38,6 +36,16 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o
+
+obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o
+obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
+
+obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
+obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
+
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -55,11 +63,12 @@ ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
+
+ obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o
endif
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@@ -68,10 +77,16 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o
+aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
+
+morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
+morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
+
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
@@ -87,6 +102,8 @@ ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
chacha20-x86_64-y += chacha20-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+
+ morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
new file mode 100644
index 000000000000..9254e0b6cc06
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -0,0 +1,749 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/* The five 128-bit AEGIS-128 state words, one XMM register each: */
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+/*
+ * KEY and MSG deliberately alias the same register (%xmm5): in the code
+ * below KEY is referenced only by crypto_aegis128_aesni_init and MSG by
+ * everything else.
+ */
+#define KEY %xmm5
+#define MSG %xmm5
+/* Scratch registers (T0 is also clobbered by aegis128_update): */
+#define T0 %xmm6
+#define T1 %xmm7
+
+/* Integer argument registers as used by the ad/enc/dec entry points: */
+#define STATEP %rdi
+#define LEN %rsi
+#define SRC %rdx
+#define DST %rcx
+
+/*
+ * Initialization constants loaded by crypto_aegis128_aesni_init.
+ * Read as one 32-byte sequence, the values are consecutive Fibonacci
+ * numbers reduced modulo 256 (0x00, 0x01, 0x01, 0x02, 0x03, 0x05, ...).
+ */
+.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
+.align 16
+.Laegis128_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+/*
+ * Byte indices 0..15. crypto_aegis128_aesni_dec_tail compares these against
+ * the broadcast tail length to build a per-byte mask.
+ */
+.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
+.align 16
+.Laegis128_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * aegis128_update
+ *
+ * One state-update round without the message XOR (callers XOR the message
+ * into the register that holds the new first state word afterwards).
+ * Each register receives one AES round of itself XORed with its predecessor
+ * (aesenc src, dst: dst = AESRound(dst) ^ src), so the logical state words
+ * end up rotated by one register: the new word 0 is in STATE4, word 1 in
+ * STATE0, word 2 in STATE1, word 3 in STATE2 and word 4 in STATE3.
+ * Five consecutive invocations bring the mapping back to the identity,
+ * which is why the bulk loops are unrolled five times.
+ *
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state (shifted positions)
+ * changed:
+ * T0
+ */
+.macro aegis128_update
+ movdqa STATE4, T0 /* keep the old STATE4; it is overwritten next */
+ aesenc STATE0, STATE4
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc T0, STATE3
+.endm
+
+/*
+ * __load_partial: internal ABI
+ *
+ * Builds a zero-padded block in MSG from the first LEN bytes at SRC, using
+ * 1-, 2-, 4- and 8-byte loads selected by bits 0..3 of LEN. The pieces are
+ * read from the highest offset downwards so each can be shifted into place
+ * before the next lower piece is merged in; no byte at or beyond SRC + LEN
+ * is read.
+ *
+ * input:
+ * LEN - bytes
+ * SRC - src
+ * output:
+ * MSG - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9, %r9
+ pxor MSG, MSG
+
+ /* bit 0 of LEN set: fetch the single trailing byte */
+ mov LEN, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov LEN, %r8
+ and $0x1E, %r8 /* LEN with bit 0 cleared = offset of that byte */
+ add SRC, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ /* bit 1 of LEN set: prepend a 2-byte piece */
+ mov LEN, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov LEN, %r8
+ and $0x1C, %r8
+ add SRC, %r8
+ shl $0x10, %r9 /* make room below what was loaded so far */
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ /* bit 2 of LEN set: prepend a 4-byte piece */
+ mov LEN, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov LEN, %r8
+ and $0x18, %r8
+ add SRC, %r8
+ shl $32, %r9
+ mov (%r8), %r8d /* 32-bit load zero-extends into %r8 */
+ xor %r8, %r9
+
+.Lld_partial_4:
+ /* up to 7 gathered bytes go into the low quadword of MSG */
+ movq %r9, MSG
+
+ /* bit 3 of LEN set: prepend an 8-byte piece */
+ mov LEN, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov LEN, %r8
+ and $0x10, %r8
+ add SRC, %r8
+ pslldq $8, MSG /* move the gathered bytes to the high quadword */
+ movq (%r8), T0
+ pxor T0, MSG
+
+.Lld_partial_8:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ *
+ * Writes the low LEN bytes of T0 to DST using 8-, 4-, 2- and 1-byte stores
+ * (at most one of each, so at most 15 bytes are written). LEN and DST
+ * themselves are left untouched; working copies live in %r8 / %r9.
+ *
+ * input:
+ * LEN - bytes
+ * DST - dst
+ * T0 - message block
+ * changed:
+ * T0 (shifted right by 8 bytes when LEN >= 8)
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov LEN, %r8
+ mov DST, %r9
+
+ movq T0, %r10 /* low quadword of the block */
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10 /* continue with the high quadword */
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $0x10, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ *
+ * Derives the initial 5-block state from key and IV and writes it to
+ * *state (%rdi). The IV (%rdx) is read with an unaligned load; the key
+ * (%rsi) is read with movdqa and therefore must be 16-byte aligned.
+ *
+ * Starting state: STATE0 = key ^ iv, STATE1 = const_1, STATE2 = const_0,
+ * STATE3 = key ^ const_0, STATE4 = key ^ const_1.
+ */
+ENTRY(crypto_aegis128_aesni_init)
+ FRAME_BEGIN
+
+ /* load IV: */
+ movdqu (%rdx), T1
+
+ /* load key: */
+ movdqa (%rsi), KEY
+ pxor KEY, T1 /* T1 = key ^ iv from here on */
+ movdqa T1, STATE0
+ movdqa KEY, STATE3
+ movdqa KEY, STATE4
+
+ /* load the constants: */
+ movdqa .Laegis128_const_0, STATE2
+ movdqa .Laegis128_const_1, STATE1
+ pxor STATE2, STATE3
+ pxor STATE1, STATE4
+
+ /* update 10 times with KEY / KEY xor IV: */
+ /*
+ * The XOR target steps STATE4, STATE3, ... STATE0 and wraps because each
+ * aegis128_update rotates the register holding the first state word.
+ * After 10 updates (two full rotations) the registers are back in
+ * natural order, so the state is stored STATE0..STATE4 below.
+ */
+ aegis128_update; pxor KEY, STATE4
+ aegis128_update; pxor T1, STATE3
+ aegis128_update; pxor KEY, STATE2
+ aegis128_update; pxor T1, STATE1
+ aegis128_update; pxor KEY, STATE0
+ aegis128_update; pxor T1, STATE4
+ aegis128_update; pxor KEY, STATE3
+ aegis128_update; pxor T1, STATE2
+ aegis128_update; pxor KEY, STATE1
+ aegis128_update; pxor T1, STATE0
+
+ /* store the state: */
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_init)
+
+/*
+ * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
+ * const void *data);
+ *
+ * Absorbs associated data into the state, 16 bytes at a time. Only whole
+ * blocks are consumed: if length < 16 the function returns without touching
+ * the state, and a trailing remainder of fewer than 16 bytes is ignored.
+ *
+ * The loop is unrolled five times because every aegis128_update rotates the
+ * state registers by one; the exit label taken (.Lad_out_N) stores the
+ * registers back in the order matching the number of rotations performed.
+ *
+ * NOTE(review): LEN is used as the full 64-bit %rsi and compared with signed
+ * jumps, while the prototype above declares an unsigned int - confirm that
+ * callers guarantee zeroed upper bits.
+ */
+ENTRY(crypto_aegis128_aesni_ad)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lad_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* pick the aligned (movdqa) or unaligned (movdqu) loop from SRC's low bits */
+ mov SRC, %r8
+ and $0xF, %r8
+ jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+ movdqa 0x00(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE4
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_1
+
+ movdqa 0x10(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE3
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_2
+
+ movdqa 0x20(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE2
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_3
+
+ movdqa 0x30(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE1
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_4
+
+ movdqa 0x40(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE0
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_0
+
+ add $0x50, SRC
+ jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+ movdqu 0x00(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE4
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_1
+
+ movdqu 0x10(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE3
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_2
+
+ movdqu 0x20(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE2
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_3
+
+ movdqu 0x30(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE1
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_4
+
+ movdqu 0x40(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE0
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_0
+
+ add $0x50, SRC
+ jmp .Lad_u_loop
+
+ /* store the state: */
+ /* .Lad_out_0: five updates done, registers are in natural order */
+.Lad_out_0:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+ /* .Lad_out_N (N = 1..4): N updates done, first state word is in STATE(5-N) */
+.Lad_out_1:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_2:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_3:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_4:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+ /* fewer than 16 bytes on entry: state was never loaded, nothing to store */
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_ad)
+
+/*
+ * encrypt_block: encrypt one 16-byte block and absorb the plaintext.
+ *
+ * a - load/store suffix: "a" (movdqa, aligned) or "u" (movdqu)
+ * s0..s4 - registers currently holding state words 0..4 (s0 is not
+ * referenced by the macro body)
+ * i - index of the block inside the 5-way unrolled group; selects
+ * the SRC/DST offset and the exit label .Lenc_out_<i>
+ *
+ * ciphertext = plaintext ^ s1 ^ s4 ^ (s2 & s3). After the state update the
+ * register named s4 holds the new first state word, so the plaintext is
+ * XORed into it. Leaves through .Lenc_out_<i> once fewer than 16 bytes
+ * remain in LEN.
+ *
+ * changed: MSG, T0, T1, LEN, STATE[0-4]
+ */
+.macro encrypt_block a s0 s1 s2 s3 s4 i
+ movdq\a (\i * 0x10)(SRC), MSG
+ movdqa MSG, T0
+ pxor \s1, T0
+ pxor \s4, T0
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, T0
+ movdq\a T0, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lenc_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Encrypts all whole 16-byte blocks of src into dst and updates the state.
+ * If length < 16 nothing is done; a trailing remainder of fewer than 16
+ * bytes is left unprocessed. The aligned loop is used only when both SRC
+ * and DST are 16-byte aligned.
+ */
+ENTRY(crypto_aegis128_aesni_enc)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lenc_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* any low address bit set in either pointer selects the unaligned loop */
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+ /* the register arguments rotate by one per block, following aegis128_update */
+ encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+ encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Lenc_u_loop
+
+ /* store the state: */
+ /* .Lenc_out_<i> is reached after block i, i.e. after i + 1 updates */
+.Lenc_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+ /* fewer than 16 bytes on entry: state was never loaded, nothing to store */
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_enc)
+
+/*
+ * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Encrypts a final partial block: the first LEN bytes at SRC are loaded
+ * zero-padded (__load_partial), encrypted, and the first LEN bytes of the
+ * result are written to DST (__store_partial). The zero-padded plaintext is
+ * then absorbed into the state with one update.
+ */
+ENTRY(crypto_aegis128_aesni_enc_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* encrypt message: */
+ call __load_partial
+
+ /* T0 = MSG ^ STATE1 ^ STATE4 ^ (STATE2 & STATE3); MSG keeps the plaintext */
+ movdqa MSG, T0
+ pxor STATE1, T0
+ pxor STATE4, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+
+ call __store_partial
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ /* one update was performed, so the first state word is in STATE4 */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+
+ FRAME_END
+ /* without this ret, execution falls through into the next function */
+ ret
+ENDPROC(crypto_aegis128_aesni_enc_tail)
+
+/*
+ * decrypt_block: decrypt one 16-byte block and absorb the plaintext.
+ *
+ * a - load/store suffix: "a" (movdqa, aligned) or "u" (movdqu)
+ * s0..s4 - registers currently holding state words 0..4 (s0 is not
+ * referenced by the macro body)
+ * i - index of the block inside the 5-way unrolled group; selects
+ * the SRC/DST offset and the exit label .Ldec_out_<i>
+ *
+ * plaintext = ciphertext ^ s1 ^ s4 ^ (s2 & s3), computed in place in MSG,
+ * written to DST and then XORed into the register named s4 (which holds the
+ * new first state word after the update). Leaves through .Ldec_out_<i>
+ * once fewer than 16 bytes remain in LEN.
+ *
+ * changed: MSG, T0, T1, LEN, STATE[0-4]
+ */
+.macro decrypt_block a s0 s1 s2 s3 s4 i
+ movdq\a (\i * 0x10)(SRC), MSG
+ pxor \s1, MSG
+ pxor \s4, MSG
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, MSG
+ movdq\a MSG, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Decrypts all whole 16-byte blocks of src into dst and updates the state.
+ * If length < 16 nothing is done; a trailing remainder of fewer than 16
+ * bytes is left unprocessed. The aligned loop is used only when both SRC
+ * and DST are 16-byte aligned.
+ */
+ENTRY(crypto_aegis128_aesni_dec)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Ldec_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* any low address bit set in either pointer selects the unaligned loop */
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+ /* the register arguments rotate by one per block, following aegis128_update */
+ decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+ decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Ldec_u_loop
+
+ /* store the state: */
+ /* .Ldec_out_<i> is reached after block i, i.e. after i + 1 updates */
+.Ldec_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+ /* fewer than 16 bytes on entry: state was never loaded, nothing to store */
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_dec)
+
+/*
+ * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Decrypts a final partial block of LEN bytes. Unlike the encrypt tail, the
+ * decrypted block must be masked before it is absorbed: XORing the keystream
+ * into the zero-padded ciphertext leaves keystream bytes in the padding
+ * positions, and those are cleared so that only the LEN plaintext bytes
+ * (zero-padded) enter the state update.
+ */
+ENTRY(crypto_aegis128_aesni_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* decrypt message: */
+ call __load_partial
+
+ pxor STATE1, MSG
+ pxor STATE4, MSG
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, MSG
+
+ /* __store_partial takes its data in T0 and may clobber it */
+ movdqa MSG, T0
+ call __store_partial
+
+ /* mask with byte count: */
+ /*
+ * Each punpcklbw doubles the number of copies of the low byte of LEN, so
+ * after four of them all 16 bytes of T0 hold it. pcmpgtb then turns byte i
+ * into 0xff where LEN > i (the counter constant holds i) and 0x00 elsewhere.
+ */
+ movq LEN, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa .Laegis128_counter, T1
+ pcmpgtb T1, T0
+ pand T0, MSG
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ /* one update was performed, so the first state word is in STATE4 */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ *
+ * Finalizes the state and XORs the resulting tag into the 16 bytes at
+ * tag_xor (%rsi, accessed unaligned). The state in memory is read but not
+ * written back.
+ *
+ * NOTE(review): %rdx and %rcx are read as full 64-bit values here, whereas
+ * the C prototype in aegis128-aesni-glue.c declares both lengths as
+ * unsigned int - confirm the upper register halves are guaranteed zero.
+ */
+ENTRY(crypto_aegis128_aesni_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* prepare length block: */
+ /* low quadword = %rdx (assoclen), high quadword = %rcx (cryptlen) */
+ movq %rdx, MSG
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ pxor STATE3, MSG
+
+ /* update state: */
+ /* seven rounds; the XOR target follows the rotating first state word */
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+ aegis128_update; pxor MSG, STATE2
+ aegis128_update; pxor MSG, STATE1
+ aegis128_update; pxor MSG, STATE0
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+
+ /* xor tag: */
+ /* MSG is reused as the accumulator; XOR of all five words is order-independent */
+ movdqu (%rsi), MSG
+
+ pxor STATE0, MSG
+ pxor STATE1, MSG
+ pxor STATE2, MSG
+ pxor STATE3, MSG
+ pxor STATE4, MSG
+
+ movdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
new file mode 100644
index 000000000000..5de7c0d46edf
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-128 Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128_BLOCK_ALIGN 16
+#define AEGIS128_BLOCK_SIZE 16
+#define AEGIS128_NONCE_SIZE 16
+#define AEGIS128_STATE_BLOCKS 5
+#define AEGIS128_KEY_SIZE 16
+#define AEGIS128_MIN_AUTH_SIZE 8
+#define AEGIS128_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis128_aesni_ad(
+ void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128_aesni_enc(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_enc_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+/*
+ * Third argument is the associated-data length, fourth the payload length:
+ * this is the order the assembly routine expects and the order used by the
+ * caller in crypto_aegis128_aesni_crypt().
+ */
+asmlinkage void crypto_aegis128_aesni_final(
+ void *state, void *tag_xor, unsigned int assoclen,
+ unsigned int cryptlen);
+
+/* One 16-byte AEGIS block, aligned to AEGIS128_BLOCK_ALIGN bytes. */
+struct aegis_block {
+ u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
+};
+
+/* Cipher state as stored in memory between calls into the assembly code. */
+struct aegis_state {
+ struct aegis_block blocks[AEGIS128_STATE_BLOCKS];
+};
+
+/* Per-transform context: holds only the key set by ->setkey(). */
+struct aegis_ctx {
+ struct aegis_block key;
+};
+
+/*
+ * Direction-specific callbacks, letting encryption and decryption share
+ * crypto_aegis128_aesni_process_crypt():
+ * skcipher_walk_init - starts the walk over the request's payload
+ * crypt_blocks - called with the full length of each walk step
+ * crypt_tail - called with the remaining (length % block size)
+ * bytes of a walk step, if any
+ */
+struct aegis_crypt_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+ void *dst);
+ void (*crypt_tail)(void *state, unsigned int length, const void *src,
+ void *dst);
+};
+
+/*
+ * Feed @assoclen bytes of associated data from scatterlist @sg_src into
+ * @state. Scatterlist segments need not be block multiples, so bytes that
+ * do not complete a block are carried over in @buf (with @pos bytes pending)
+ * to the next segment. A final incomplete block is zero-padded and absorbed.
+ */
+static void crypto_aegis128_aesni_process_ad(
+ struct aegis_state *state, struct scatterlist *sg_src,
+ unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct aegis_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ /* enough pending + new bytes for at least one whole block? */
+ if (pos + size >= AEGIS128_BLOCK_SIZE) {
+ if (pos > 0) {
+ /* complete the carried-over block first */
+ unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ crypto_aegis128_aesni_ad(state,
+ AEGIS128_BLOCK_SIZE,
+ buf.bytes);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ /* called with the full remaining length; skip what whole blocks cover */
+ crypto_aegis128_aesni_ad(state, left, src);
+
+ src += left & ~(AEGIS128_BLOCK_SIZE - 1);
+ left &= AEGIS128_BLOCK_SIZE - 1;
+ }
+
+ /* stash the sub-block remainder for the next segment */
+ memcpy(buf.bytes + pos, src, left);
+ pos += left;
+ assoclen -= size;
+
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ /* zero-pad and absorb the last partial block */
+ memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
+ crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+ }
+}
+
+/*
+ * Run the request payload through the cipher, one walk step at a time.
+ * ->crypt_blocks() is handed the complete step length; the bytes past the
+ * last whole block of the step, if any, then go to ->crypt_tail().
+ */
+static void crypto_aegis128_aesni_process_crypt(
+		struct aegis_state *state, struct aead_request *req,
+		const struct aegis_crypt_ops *ops)
+{
+	struct skcipher_walk walk;
+
+	ops->skcipher_walk_init(&walk, req, false);
+
+	for (; walk.nbytes; skcipher_walk_done(&walk, 0)) {
+		u8 *in = walk.src.virt.addr;
+		u8 *out = walk.dst.virt.addr;
+		unsigned int total = walk.nbytes;
+		unsigned int whole = total & ~(AEGIS128_BLOCK_SIZE - 1);
+		unsigned int rest = total & (AEGIS128_BLOCK_SIZE - 1);
+
+		ops->crypt_blocks(state, total, in, out);
+
+		if (rest > 0)
+			ops->crypt_tail(state, rest, in + whole, out + whole);
+	}
+}
+
+/*
+ * Return the transform context, rounded up to the alignment that
+ * struct aegis_ctx requires.
+ */
+static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
+{
+	u8 *raw = crypto_aead_ctx(aead);
+
+	return (void *)PTR_ALIGN(raw, __alignof__(struct aegis_ctx));
+}
+
+/*
+ * Store the key in the transform context. Any key length other than
+ * AEGIS128_KEY_SIZE is rejected with -EINVAL and the BAD_KEY_LEN flag.
+ */
+static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+					unsigned int keylen)
+{
+	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead);
+
+	if (keylen == AEGIS128_KEY_SIZE) {
+		memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
+		return 0;
+	}
+
+	crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+/*
+ * Accept tag sizes in [AEGIS128_MIN_AUTH_SIZE, AEGIS128_MAX_AUTH_SIZE];
+ * everything else yields -EINVAL.
+ */
+static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
+					     unsigned int authsize)
+{
+	if (authsize < AEGIS128_MIN_AUTH_SIZE ||
+	    authsize > AEGIS128_MAX_AUTH_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Common encrypt/decrypt driver: initialise the state from key and IV,
+ * absorb the associated data, process the payload through @ops and XOR the
+ * computed tag into @tag_xor. The whole sequence runs inside a
+ * kernel_fpu_begin()/kernel_fpu_end() section.
+ */
+static void crypto_aegis128_aesni_crypt(struct aead_request *req,
+					struct aegis_block *tag_xor,
+					unsigned int cryptlen,
+					const struct aegis_crypt_ops *ops)
+{
+	struct aegis_ctx *ctx =
+		crypto_aegis128_aesni_ctx(crypto_aead_reqtfm(req));
+	struct aegis_state state;
+
+	kernel_fpu_begin();
+
+	crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+	crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
+	crypto_aegis128_aesni_process_crypt(&state, req, ops);
+	crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+/*
+ * AEAD ->encrypt(): encrypt the payload, then append the first authsize
+ * bytes of the tag to the destination right after the ciphertext.
+ * The tag starts out zeroed, so XORing into it yields the tag itself.
+ */
+static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
+{
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+		.crypt_blocks = crypto_aegis128_aesni_enc,
+		.crypt_tail = crypto_aegis128_aesni_enc_tail,
+	};
+
+	unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+	unsigned int datalen = req->cryptlen;
+	struct aegis_block tag = {};
+
+	crypto_aegis128_aesni_crypt(req, &tag, datalen, &OPS);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + datalen, taglen, 1);
+
+	return 0;
+}
+
+/*
+ * AEAD ->decrypt(): the received tag is read first and the computed tag is
+ * XORed into it, so a valid message leaves authsize zero bytes. The check
+ * uses crypto_memneq(); a mismatch returns -EBADMSG.
+ */
+static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
+{
+	static const struct aegis_block zeros = {};
+
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+		.crypt_blocks = crypto_aegis128_aesni_dec,
+		.crypt_tail = crypto_aegis128_aesni_dec_tail,
+	};
+
+	unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+	unsigned int datalen = req->cryptlen - taglen;
+	struct aegis_block tag;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + datalen, taglen, 0);
+
+	crypto_aegis128_aesni_crypt(req, &tag, datalen, &OPS);
+
+	if (crypto_memneq(tag.bytes, zeros.bytes, taglen))
+		return -EBADMSG;
+
+	return 0;
+}
+
+/* AEAD ->init() hook of the inner algorithm: nothing to set up. */
+static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+ return 0;
+}
+
+/* AEAD ->exit() hook of the inner algorithm: nothing to tear down. */
+static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+/* Forward the key to the cryptd transform kept in this transform's context. */
+static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead,
+					const u8 *key, unsigned int keylen)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+
+/* Forward the tag size to the cryptd transform kept in the context. */
+static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead,
+					     unsigned int authsize)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+
+/*
+ * .encrypt of the public wrapper.
+ *
+ * Picks the transform that will serve the request and re-targets the request
+ * to it: the cryptd child when the FPU is usable and we are either not in
+ * atomic context or cryptd has nothing queued, otherwise the cryptd
+ * transform itself. Returns the result of crypto_aead_encrypt().
+ */
+static int cryptd_aegis128_aesni_encrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_encrypt(req);
+}
+
+/*
+ * .decrypt of the public wrapper.
+ *
+ * Uses the same transform selection as the encrypt path: the cryptd child
+ * when the FPU is usable and we are either not in atomic context or cryptd
+ * has nothing queued, otherwise the cryptd transform itself. Returns the
+ * result of crypto_aead_decrypt().
+ */
+static int cryptd_aegis128_aesni_decrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_decrypt(req);
+}
+
+/*
+ * .init of the public wrapper: allocates a cryptd AEAD around the internal
+ * "__aegis128-aesni" driver, stores its pointer in the transform context and
+ * adopts the cryptd transform's request size.
+ *
+ * Returns 0 on success or the error encoded in the pointer returned by
+ * cryptd_alloc_aead().
+ */
+static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+	struct cryptd_aead *inner;
+
+	inner = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL,
+				  CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(inner))
+		return PTR_ERR(inner);
+
+	*slot = inner;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&inner->base));
+
+	return 0;
+}
+
+/* .exit of the public wrapper: frees the cryptd AEAD stored in the context. */
+static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*slot);
+}
+
+/*
+ * The two AEAD algorithms registered by this module:
+ * [0] "__aegis128-aesni": flagged CRYPTO_ALG_INTERNAL, implemented by the
+ * crypto_aegis128_aesni_* handlers.
+ * [1] "aegis128-aesni": flagged CRYPTO_ALG_ASYNC with priority 400,
+ * implemented by the cryptd_aegis128_aesni_* wrappers, which allocate
+ * a cryptd instance of entry [0].
+ */
+static struct aead_alg crypto_aegis128_aesni_alg[] = {
+ {
+ .setkey = crypto_aegis128_aesni_setkey,
+ .setauthsize = crypto_aegis128_aesni_setauthsize,
+ .encrypt = crypto_aegis128_aesni_encrypt,
+ .decrypt = crypto_aegis128_aesni_decrypt,
+ .init = crypto_aegis128_aesni_init_tfm,
+ .exit = crypto_aegis128_aesni_exit_tfm,
+
+ .ivsize = AEGIS128_NONCE_SIZE,
+ .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ /* extra __alignof__ bytes leave room to align the context by hand */
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+
+ .cra_name = "__aegis128",
+ .cra_driver_name = "__aegis128-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }, {
+ .setkey = cryptd_aegis128_aesni_setkey,
+ .setauthsize = cryptd_aegis128_aesni_setauthsize,
+ .encrypt = cryptd_aegis128_aesni_encrypt,
+ .decrypt = cryptd_aegis128_aesni_decrypt,
+ .init = cryptd_aegis128_aesni_init_tfm,
+ .exit = cryptd_aegis128_aesni_exit_tfm,
+
+ .ivsize = AEGIS128_NONCE_SIZE,
+ .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ /* the context holds only the pointer to the cryptd transform */
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_alignmask = 0,
+
+ .cra_priority = 400,
+
+ .cra_name = "aegis128",
+ .cra_driver_name = "aegis128-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }
+};
+
+/*
+ * CPU feature table: one entry for AES-NI and one for SSE2. It is exported
+ * through MODULE_DEVICE_TABLE and passed to x86_match_cpu() at module init.
+ * NOTE(review): x86_match_cpu() is understood to succeed when ANY single
+ * entry matches, so this table alone does not require both features at once
+ * - confirm against the x86_match_cpu() documentation.
+ */
+static const struct x86_cpu_id aesni_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_AES),
+ X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+/*
+ * Module entry point: registers both AEGIS-128 AEAD algorithms, but only on
+ * CPUs that provide AES-NI *and* SSE2.
+ *
+ * Returns 0 on success, -ENODEV when a required CPU feature is missing, or
+ * the error returned by crypto_register_aeads().
+ */
+static int __init crypto_aegis128_aesni_module_init(void)
+{
+	/*
+	 * x86_match_cpu() reports a match as soon as ANY entry of the table
+	 * matches, so on its own it would accept a CPU that has SSE2 but no
+	 * AES-NI and register an AES-NI driver on hardware that cannot run
+	 * it. Require each feature explicitly.
+	 */
+	if (!x86_match_cpu(aesni_cpu_id) ||
+	    !boot_cpu_has(X86_FEATURE_AES) ||
+	    !boot_cpu_has(X86_FEATURE_XMM2))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_aegis128_aesni_alg,
+				     ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+/* Module exit: unregisters every algorithm in crypto_aegis128_aesni_alg. */
+static void __exit crypto_aegis128_aesni_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_aegis128_aesni_alg,
+ ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+/* Hook the init/exit functions into module load and unload. */
+module_init(crypto_aegis128_aesni_module_init);
+module_exit(crypto_aegis128_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+/* Aliases for the algorithm name and the driver name of the public entry. */
+MODULE_ALIAS_CRYPTO("aegis128");
+MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
new file mode 100644
index 000000000000..9263c344f2c7
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -0,0 +1,825 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128L
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/*
+ * Register assignment (AT&T syntax is used throughout this file).
+ *
+ * STATE0-STATE7: the eight 128-bit state words, as filled by state_load.
+ * MSG0/MSG1: the two 16-byte halves of the 32-byte block being processed.
+ * T0-T3: scratch registers.
+ */
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define STATE5 %xmm5
+#define STATE6 %xmm6
+#define STATE7 %xmm7
+#define MSG0 %xmm8
+#define MSG1 %xmm9
+#define T0 %xmm10
+#define T1 %xmm11
+#define T2 %xmm12
+#define T3 %xmm13
+
+/*
+ * Integer arguments: the first four System V AMD64 argument registers,
+ * matching the (state, length, src, dst) order of the C prototypes.
+ * NOTE(review): LEN names the full 64-bit %rsi while the C prototypes in
+ * the glue code declare the length as unsigned int - confirm the upper 32
+ * bits are always clear at the call sites.
+ */
+#define STATEP %rdi
+#define LEN %rsi
+#define SRC %rdx
+#define DST %rcx
+
+/*
+ * Initialization constants loaded by crypto_aegis128l_aesni_init. The 32
+ * bytes are the Fibonacci sequence reduced modulo 256 (0, 1, 1, 2, 3, 5, ...),
+ * with const_1 continuing where const_0 stops.
+ */
+.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32
+.align 16
+.Laegis128l_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128l_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+/*
+ * Byte indices 0..15 and 16..31. crypto_aegis128l_aesni_dec_tail compares
+ * them against the byte count to build a "first LEN bytes" mask.
+ */
+.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16
+.align 16
+.Laegis128l_counter0:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Laegis128l_counter1:
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ *
+ * Gathers a partial block from SRC into MSG0:MSG1, zero-padded at the end.
+ * Only bits 0-4 of LEN are examined, one bit per chunk size (1, 2, 4, 8 and
+ * 16 bytes); each chunk sits at the offset formed by the LEN bits above it,
+ * so no byte at or beyond offset (LEN & 0x1f) is read.
+ *
+ * input:
+ * LEN - bytes
+ * SRC - src
+ * output:
+ * MSG0 - first message block
+ * MSG1 - second message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ * flags
+ */
+__load_partial:
+ xor %r9, %r9
+ pxor MSG0, MSG0
+ pxor MSG1, MSG1
+
+ /* bit 0: the single trailing byte */
+ mov LEN, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov LEN, %r8
+ and $0x1E, %r8
+ add SRC, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ /* bit 1: a 2-byte chunk, placed below what was gathered so far */
+ mov LEN, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov LEN, %r8
+ and $0x1C, %r8
+ add SRC, %r8
+ shl $0x10, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ /* bit 2: a 4-byte chunk */
+ mov LEN, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov LEN, %r8
+ and $0x18, %r8
+ add SRC, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ /* up to 7 gathered bytes now form the low quadword of MSG0 */
+ movq %r9, MSG0
+
+ /* bit 3: an 8-byte chunk; what was gathered moves to the high quadword */
+ mov LEN, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov LEN, %r8
+ and $0x10, %r8
+ add SRC, %r8
+ pslldq $8, MSG0
+ movq (%r8), T0
+ pxor T0, MSG0
+
+.Lld_partial_8:
+ /* bit 4: a full first 16-byte half; what was gathered becomes MSG1 */
+ mov LEN, %r8
+ and $0x10, %r8
+ jz .Lld_partial_16
+
+ movdqa MSG0, MSG1
+ movdqu (SRC), MSG0
+
+.Lld_partial_16:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ *
+ * Writes the leading bytes of T0:T1 to DST in chunks of 16, 8, 4, 2 and 1
+ * bytes, each chunk being emitted while the remaining count is at least its
+ * size (so at most 31 bytes are written).
+ *
+ * input:
+ * LEN - bytes
+ * DST - dst
+ * T0 - first message block
+ * T1 - second message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ * %r10
+ * flags
+ */
+__store_partial:
+ /* work on copies so LEN and DST stay intact for the caller */
+ mov LEN, %r8
+ mov DST, %r9
+
+ cmp $16, %r8
+ jl .Lst_partial_16
+
+ /* emit the whole first half and continue with the second one */
+ movdqu T0, (%r9)
+ movdqa T1, T0
+
+ sub $16, %r8
+ add $16, %r9
+
+.Lst_partial_16:
+ /* %r10 = next 8 bytes still to be written */
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $0x10, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * update: one state-update step without message injection.
+ *
+ * Every state register is replaced by one AES round (aesenc) of itself,
+ * with the previous value of the next-higher register used as the round
+ * key: STATE7 takes STATE0, STATE0 takes STATE1, ... and STATE6 takes the
+ * old STATE7, which is saved in T0 first. Clobbers T0.
+ */
+.macro update
+ movdqa STATE7, T0
+ aesenc STATE0, STATE7
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc STATE4, STATE3
+ aesenc STATE5, STATE4
+ aesenc STATE6, STATE5
+ aesenc T0, STATE6
+.endm
+
+/*
+ * update0 .. update7: a full update step. Each runs `update` and then XORs
+ * MSG0 and MSG1 into two state registers. The target pair moves down by one
+ * register from one variant to the next (STATE7/STATE3 for update0, down to
+ * STATE0/STATE4 for update7), so the variants are meant to be used in
+ * sequence; state_store<i> writes the state back after update<i>.
+ */
+.macro update0
+ update
+ pxor MSG0, STATE7
+ pxor MSG1, STATE3
+.endm
+
+.macro update1
+ update
+ pxor MSG0, STATE6
+ pxor MSG1, STATE2
+.endm
+
+.macro update2
+ update
+ pxor MSG0, STATE5
+ pxor MSG1, STATE1
+.endm
+
+.macro update3
+ update
+ pxor MSG0, STATE4
+ pxor MSG1, STATE0
+.endm
+
+.macro update4
+ update
+ pxor MSG0, STATE3
+ pxor MSG1, STATE7
+.endm
+
+.macro update5
+ update
+ pxor MSG0, STATE2
+ pxor MSG1, STATE6
+.endm
+
+.macro update6
+ update
+ pxor MSG0, STATE1
+ pxor MSG1, STATE5
+.endm
+
+.macro update7
+ update
+ pxor MSG0, STATE0
+ pxor MSG1, STATE4
+.endm
+
+/*
+ * state_load: reads the eight 16-byte state words at STATEP into
+ * STATE0..STATE7 using unaligned loads.
+ */
+.macro state_load
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+ movdqu 0x50(STATEP), STATE5
+ movdqu 0x60(STATEP), STATE6
+ movdqu 0x70(STATEP), STATE7
+.endm
+
+/*
+ * state_store: writes eight registers back to the state at STATEP, rotated
+ * by one position: \s7 goes to offset 0x00 and \s0..\s6 follow at 0x10..0x70.
+ */
+.macro state_store s0 s1 s2 s3 s4 s5 s6 s7
+ movdqu \s7, 0x00(STATEP)
+ movdqu \s0, 0x10(STATEP)
+ movdqu \s1, 0x20(STATEP)
+ movdqu \s2, 0x30(STATEP)
+ movdqu \s3, 0x40(STATEP)
+ movdqu \s4, 0x50(STATEP)
+ movdqu \s5, 0x60(STATEP)
+ movdqu \s6, 0x70(STATEP)
+.endm
+
+/*
+ * state_store0 .. state_store7: state_store with the register list rotated
+ * by the variant number. state_store<i> is the write-back used when the last
+ * step executed was update<i>.
+ */
+.macro state_store0
+ state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro state_store1
+ state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro state_store2
+ state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store3
+ state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store4
+ state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store5
+ state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store6
+ state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro state_store7
+ state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv);
+ *
+ * Builds the initial state from the 16-byte key and the 16-byte IV and
+ * writes it to `state`. The key is read with an aligned load (movdqa), so
+ * `key` must be 16-byte aligned; the IV is read with an unaligned load.
+ */
+ENTRY(crypto_aegis128l_aesni_init)
+ FRAME_BEGIN
+
+ /* load key: */
+ movdqa (%rsi), MSG1
+ movdqa MSG1, STATE0
+ movdqa MSG1, STATE4
+ movdqa MSG1, STATE5
+ movdqa MSG1, STATE6
+ movdqa MSG1, STATE7
+
+ /* load IV: */
+ movdqu (%rdx), MSG0
+ pxor MSG0, STATE0
+ pxor MSG0, STATE4
+
+ /* load the constants: */
+ /*
+ * result: STATE0 = STATE4 = key ^ iv, STATE1 = STATE3 = const_1,
+ * STATE2 = const_0, STATE5 = STATE7 = key ^ const_0,
+ * STATE6 = key ^ const_1
+ */
+ movdqa .Laegis128l_const_0, STATE2
+ movdqa .Laegis128l_const_1, STATE1
+ movdqa STATE1, STATE3
+ pxor STATE2, STATE5
+ pxor STATE1, STATE6
+ pxor STATE2, STATE7
+
+ /* update 10 times with IV and KEY: */
+ update0
+ update1
+ update2
+ update3
+ update4
+ update5
+ update6
+ update7
+ update0
+ update1
+
+ /* the last step was update1, hence state_store1 */
+ state_store1
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_init)
+
+/*
+ * ad_block a i: absorb the i-th 32-byte block of the current 256-byte
+ * stride of associated data.
+ * \a - load flavour: `a` selects movdqa (aligned), `u` selects movdqu
+ * \i - block index 0..7; picks the source offset and the update variant
+ * Leaves through .Lad_out_\i once fewer than 32 bytes remain in LEN.
+ */
+.macro ad_block a i
+ movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+ movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+ update\i
+ sub $0x20, LEN
+ cmp $0x20, LEN
+ jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_ad(void *state, unsigned int length,
+ * const void *data);
+ *
+ * Absorbs every complete 32-byte block of `data` into the state. A trailing
+ * remainder shorter than 32 bytes is ignored, and when length < 32 the state
+ * is neither read nor written.
+ */
+ENTRY(crypto_aegis128l_aesni_ad)
+ FRAME_BEGIN
+
+ cmp $0x20, LEN
+ jb .Lad_out
+
+ state_load
+
+ /* take the movdqu loop unless SRC is 16-byte aligned */
+ mov SRC, %r8
+ and $0xf, %r8
+ jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+ ad_block a 0
+ ad_block a 1
+ ad_block a 2
+ ad_block a 3
+ ad_block a 4
+ ad_block a 5
+ ad_block a 6
+ ad_block a 7
+
+ /* eight blocks (0x100 bytes) consumed; LEN was already reduced per block */
+ add $0x100, SRC
+ jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+ ad_block u 0
+ ad_block u 1
+ ad_block u 2
+ ad_block u 3
+ ad_block u 4
+ ad_block u 5
+ ad_block u 6
+ ad_block u 7
+
+ add $0x100, SRC
+ jmp .Lad_u_loop
+
+ /* exits: write the state back with the rotation matching the last update */
+.Lad_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lad_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lad_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lad_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lad_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lad_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Lad_out_6:
+ state_store6
+ FRAME_END
+ ret
+
+.Lad_out_7:
+ state_store7
+ FRAME_END
+ ret
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_ad)
+
+/*
+ * crypt: XOR the keystream derived from the state into a 32-byte block.
+ * \m0 ^= \s1 ^ \s6 ^ (\s2 & \s3)
+ * \m1 ^= \s2 ^ \s5 ^ (\s6 & \s7)
+ * \s0 and \s4 are accepted for symmetry but not used. Clobbers T3.
+ */
+.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7
+ pxor \s1, \m0
+ pxor \s6, \m0
+ movdqa \s2, T3
+ pand \s3, T3
+ pxor T3, \m0
+
+ pxor \s2, \m1
+ pxor \s5, \m1
+ movdqa \s6, T3
+ pand \s7, T3
+ pxor T3, \m1
+.endm
+
+/*
+ * crypt0 .. crypt7: crypt with the register list rotated by the variant
+ * number, using the same rotation as the state_store<i> lists. crypt<i> is
+ * the variant used immediately before update<i>.
+ */
+.macro crypt0 m0 m1
+ crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro crypt1 m0 m1
+ crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro crypt2 m0 m1
+ crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt3 m0 m1
+ crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt4 m0 m1
+ crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt5 m0 m1
+ crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt6 m0 m1
+ crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro crypt7 m0 m1
+ crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+/*
+ * encrypt_block a i: encrypt the i-th 32-byte block of the current stride.
+ * \a - `a` for movdqa (aligned) or `u` for movdqu loads and stores
+ * \i - block index 0..7
+ * The block is loaded into MSG0/MSG1; a copy in T0/T1 is XORed with the
+ * keystream and stored to DST, then the state is updated with the block as
+ * loaded (still in MSG0/MSG1). Leaves through .Lenc_out_\i once fewer than
+ * 32 bytes remain in LEN.
+ */
+.macro encrypt_block a i
+ movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+ movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+ movdqa MSG0, T0
+ movdqa MSG1, T1
+ crypt\i T0, T1
+ movdq\a T0, (\i * 0x20 + 0x00)(DST)
+ movdq\a T1, (\i * 0x20 + 0x10)(DST)
+
+ update\i
+
+ sub $0x20, LEN
+ cmp $0x20, LEN
+ jl .Lenc_out_\i
+.endm
+
+/*
+ * decrypt_block a i: decrypt the i-th 32-byte block of the current stride.
+ * \a - `a` for movdqa (aligned) or `u` for movdqu loads and stores
+ * \i - block index 0..7
+ * The block is XORed with the keystream in place in MSG0/MSG1, stored to
+ * DST, and the state is then updated with that result. Leaves through
+ * .Ldec_out_\i once fewer than 32 bytes remain in LEN.
+ */
+.macro decrypt_block a i
+ movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+ movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+ crypt\i MSG0, MSG1
+ movdq\a MSG0, (\i * 0x20 + 0x00)(DST)
+ movdq\a MSG1, (\i * 0x20 + 0x10)(DST)
+
+ update\i
+
+ sub $0x20, LEN
+ cmp $0x20, LEN
+ jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_enc(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Encrypts every complete 32-byte block from src to dst and updates the
+ * state. A trailing remainder shorter than 32 bytes is left untouched, and
+ * when length < 32 the state is neither read nor written.
+ */
+ENTRY(crypto_aegis128l_aesni_enc)
+ FRAME_BEGIN
+
+ cmp $0x20, LEN
+ jb .Lenc_out
+
+ state_load
+
+ /* the aligned loop needs both SRC and DST to be 16-byte aligned */
+ mov SRC, %r8
+ or DST, %r8
+ and $0xf, %r8
+ jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+ encrypt_block a 0
+ encrypt_block a 1
+ encrypt_block a 2
+ encrypt_block a 3
+ encrypt_block a 4
+ encrypt_block a 5
+ encrypt_block a 6
+ encrypt_block a 7
+
+ /* eight blocks (0x100 bytes) done; LEN was already reduced per block */
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+ encrypt_block u 0
+ encrypt_block u 1
+ encrypt_block u 2
+ encrypt_block u 3
+ encrypt_block u 4
+ encrypt_block u 5
+ encrypt_block u 6
+ encrypt_block u 7
+
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Lenc_u_loop
+
+ /* exits: write the state back with the rotation matching the last update */
+.Lenc_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lenc_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lenc_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lenc_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lenc_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lenc_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Lenc_out_6:
+ state_store6
+ FRAME_END
+ ret
+
+.Lenc_out_7:
+ state_store7
+ FRAME_END
+ ret
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_enc)
+
+/*
+ * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length,
+ *                                      const void *src, void *dst);
+ *
+ * Encrypts a final partial block (fewer than 32 bytes): the input is loaded
+ * zero-padded, a copy is XORed with the keystream and `length` bytes of it
+ * are written to dst, then the state is updated with the zero-padded input
+ * and written back.
+ */
+ENTRY(crypto_aegis128l_aesni_enc_tail)
+	FRAME_BEGIN
+
+	state_load
+
+	/* encrypt message: */
+	call __load_partial
+
+	/* keep the padded plaintext in MSG0/MSG1 for the state update below */
+	movdqa MSG0, T0
+	movdqa MSG1, T1
+	crypt0 T0, T1
+
+	call __store_partial
+
+	update0
+
+	state_store0
+
+	FRAME_END
+	/* without this ret execution would run on into crypto_aegis128l_aesni_dec */
+	ret
+ENDPROC(crypto_aegis128l_aesni_enc_tail)
+
+/*
+ * void crypto_aegis128l_aesni_dec(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Decrypts every complete 32-byte block from src to dst and updates the
+ * state. A trailing remainder shorter than 32 bytes is left untouched, and
+ * when length < 32 the state is neither read nor written.
+ */
+ENTRY(crypto_aegis128l_aesni_dec)
+ FRAME_BEGIN
+
+ cmp $0x20, LEN
+ jb .Ldec_out
+
+ state_load
+
+ /* the aligned loop needs both SRC and DST to be 16-byte aligned */
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+ decrypt_block a 0
+ decrypt_block a 1
+ decrypt_block a 2
+ decrypt_block a 3
+ decrypt_block a 4
+ decrypt_block a 5
+ decrypt_block a 6
+ decrypt_block a 7
+
+ /* eight blocks (0x100 bytes) done; LEN was already reduced per block */
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+ decrypt_block u 0
+ decrypt_block u 1
+ decrypt_block u 2
+ decrypt_block u 3
+ decrypt_block u 4
+ decrypt_block u 5
+ decrypt_block u 6
+ decrypt_block u 7
+
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Ldec_u_loop
+
+ /* exits: write the state back with the rotation matching the last update */
+.Ldec_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Ldec_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Ldec_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Ldec_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Ldec_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Ldec_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Ldec_out_6:
+ state_store6
+ FRAME_END
+ ret
+
+.Ldec_out_7:
+ state_store7
+ FRAME_END
+ ret
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_dec)
+
+/*
+ * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Decrypts a final partial block (fewer than 32 bytes): the input is loaded
+ * zero-padded and XORed with the keystream, `length` bytes of the result
+ * are written to dst, and the state is updated with the result after every
+ * byte at index >= length has been cleared.
+ */
+ENTRY(crypto_aegis128l_aesni_dec_tail)
+ FRAME_BEGIN
+
+ state_load
+
+ /* decrypt message: */
+ call __load_partial
+
+ crypt0 MSG0, MSG1
+
+ movdqa MSG0, T0
+ movdqa MSG1, T1
+ call __store_partial
+
+ /* mask with byte count: */
+ /* four punpcklbw steps replicate the low byte of LEN into all 16 bytes */
+ movq LEN, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa T0, T1
+ movdqa .Laegis128l_counter0, T2
+ movdqa .Laegis128l_counter1, T3
+ /* each mask byte becomes 0xff where LEN > byte index, 0x00 elsewhere */
+ pcmpgtb T2, T0
+ pcmpgtb T3, T1
+ pand T0, MSG0
+ pand T1, MSG1
+
+ update0
+
+ state_store0
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128l_aesni_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ *
+ * Runs the finalization steps and XORs the resulting 16-byte tag into the
+ * buffer at tag_xor. Both lengths are read as full 64-bit registers. The
+ * updated state is not written back to `state`.
+ */
+ENTRY(crypto_aegis128l_aesni_final)
+ FRAME_BEGIN
+
+ state_load
+
+ /* prepare length block: */
+ /* low quadword = assoclen (%rdx), high quadword = cryptlen (%rcx) */
+ movq %rdx, MSG0
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG0
+ psllq $3, MSG0 /* multiply by 8 (to get bit count) */
+
+ pxor STATE2, MSG0
+ movdqa MSG0, MSG1
+
+ /* update state: */
+ update0
+ update1
+ update2
+ update3
+ update4
+ update5
+ update6
+
+ /* xor tag: */
+ /* every state register except STATE0 is folded into the tag */
+ movdqu (%rsi), T0
+
+ pxor STATE1, T0
+ pxor STATE2, T0
+ pxor STATE3, T0
+ pxor STATE4, T0
+ pxor STATE5, T0
+ pxor STATE6, T0
+ pxor STATE7, T0
+
+ movdqu T0, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_final)
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
new file mode 100644
index 000000000000..876e4866e633
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-128L Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+
+/* Sizes in bytes, except AEGIS128L_STATE_BLOCKS which is a count. */
+#define AEGIS128L_BLOCK_ALIGN 16
+#define AEGIS128L_BLOCK_SIZE 32
+#define AEGIS128L_NONCE_SIZE 16
+#define AEGIS128L_STATE_BLOCKS 8
+#define AEGIS128L_KEY_SIZE 16
+/* bounds enforced by crypto_aegis128l_aesni_setauthsize() */
+#define AEGIS128L_MIN_AUTH_SIZE 8
+#define AEGIS128L_MAX_AUTH_SIZE 16
+
+/*
+ * Entry points implemented in aegis128l-aesni-asm.S.
+ *
+ * init reads a 16-byte key (with an aligned load) and a 16-byte IV and
+ * writes the initial state. ad/enc/dec process whole 32-byte blocks only;
+ * enc_tail/dec_tail handle a final partial block.
+ */
+asmlinkage void crypto_aegis128l_aesni_init(
+		void *state, const void *key, const void *iv);
+
+asmlinkage void crypto_aegis128l_aesni_ad(
+		void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128l_aesni_enc(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_enc_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+/*
+ * The assembly takes (assoclen, cryptlen) in that order and reads both as
+ * full 64-bit registers, so they are declared u64 here: the compiler then
+ * zero-extends the 32-bit lengths passed by the caller.
+ */
+asmlinkage void crypto_aegis128l_aesni_final(
+		void *state, void *tag_xor, u64 assoclen, u64 cryptlen);
+
+/* One AEGIS128L_BLOCK_SIZE-byte buffer, aligned to AEGIS128L_BLOCK_ALIGN. */
+struct aegis_block {
+ u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN);
+};
+
+/*
+ * Cipher state buffer handed to the assembly routines as `state`.
+ * NOTE(review): this reserves AEGIS128L_STATE_BLOCKS * AEGIS128L_BLOCK_SIZE
+ * = 256 bytes, while state_load/state_store in the assembly only touch
+ * offsets 0x00..0x7f - confirm the intended size.
+ */
+struct aegis_state {
+ struct aegis_block blocks[AEGIS128L_STATE_BLOCKS];
+};
+
+/* Per-transform context: the key, of which setkey fills AEGIS128L_KEY_SIZE bytes. */
+struct aegis_ctx {
+ struct aegis_block key;
+};
+
+/*
+ * Direction-specific callbacks used by crypto_aegis128l_aesni_process_crypt():
+ * skcipher_walk_init - starts the walk over the request's data
+ * crypt_blocks - processes the whole blocks of a chunk
+ * crypt_tail - processes the remainder shorter than one block
+ */
+struct aegis_crypt_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+ void *dst);
+ void (*crypt_tail)(void *state, unsigned int length, const void *src,
+ void *dst);
+};
+
+/*
+ * Absorbs assoclen bytes of associated data from the scatterlist sg_src into
+ * the cipher state.
+ *
+ * The assembly helper only consumes whole blocks, so bytes that do not fill
+ * a block are collected in a local buffer across scatterlist segments. The
+ * buffer is flushed when it becomes full, and once more at the very end,
+ * zero-padded, if anything is left in it.
+ */
+static void crypto_aegis128l_aesni_process_ad(
+		struct aegis_state *state, struct scatterlist *sg_src,
+		unsigned int assoclen)
+{
+	struct scatter_walk walk;
+	struct aegis_block partial;
+	unsigned int buffered = 0;
+
+	scatterwalk_start(&walk, sg_src);
+	while (assoclen != 0) {
+		unsigned int seg_len = scatterwalk_clamp(&walk, assoclen);
+		unsigned int remaining = seg_len;
+		void *mapped = scatterwalk_map(&walk);
+		const u8 *cursor = (const u8 *)mapped;
+
+		if (buffered + seg_len >= AEGIS128L_BLOCK_SIZE) {
+			/* Top up and flush a previously started block first. */
+			if (buffered > 0) {
+				unsigned int fill =
+					AEGIS128L_BLOCK_SIZE - buffered;
+
+				memcpy(partial.bytes + buffered, cursor, fill);
+				crypto_aegis128l_aesni_ad(state,
+							  AEGIS128L_BLOCK_SIZE,
+							  partial.bytes);
+				buffered = 0;
+				remaining -= fill;
+				cursor += fill;
+			}
+
+			/* The helper consumes the whole blocks only. */
+			crypto_aegis128l_aesni_ad(state, remaining, cursor);
+
+			cursor += remaining -
+				  remaining % AEGIS128L_BLOCK_SIZE;
+			remaining %= AEGIS128L_BLOCK_SIZE;
+		}
+
+		/* Keep the unprocessed remainder for the next segment. */
+		memcpy(partial.bytes + buffered, cursor, remaining);
+		buffered += remaining;
+		assoclen -= seg_len;
+
+		scatterwalk_unmap(mapped);
+		scatterwalk_advance(&walk, seg_len);
+		scatterwalk_done(&walk, 0, assoclen);
+	}
+
+	if (buffered > 0) {
+		memset(partial.bytes + buffered, 0,
+		       AEGIS128L_BLOCK_SIZE - buffered);
+		crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE,
+					  partial.bytes);
+	}
+}
+
+/*
+ * Walks the request's data and runs the direction-specific callbacks over
+ * it: crypt_blocks gets every chunk, and crypt_tail gets the remainder of
+ * the chunk that is shorter than one block, if there is one.
+ */
+static void crypto_aegis128l_aesni_process_crypt(
+		struct aegis_state *state, struct aead_request *req,
+		const struct aegis_crypt_ops *ops)
+{
+	struct skcipher_walk walk;
+
+	ops->skcipher_walk_init(&walk, req, false);
+
+	for (; walk.nbytes; skcipher_walk_done(&walk, 0)) {
+		u8 *in = walk.src.virt.addr;
+		u8 *out = walk.dst.virt.addr;
+		unsigned int avail = walk.nbytes;
+		unsigned int whole = avail & ~(AEGIS128L_BLOCK_SIZE - 1);
+		unsigned int tail = avail - whole;
+
+		/* The block routine ignores the trailing partial block. */
+		ops->crypt_blocks(state, avail, in, out);
+
+		if (tail > 0)
+			ops->crypt_tail(state, tail, in + whole, out + whole);
+	}
+}
+
+/*
+ * Returns the transform context rounded up to the alignment of struct
+ * aegis_ctx. The algorithm's cra_ctxsize reserves the extra bytes this
+ * rounding may consume.
+ */
+static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead)
+{
+	u8 *raw = crypto_aead_ctx(aead);
+
+	return (void *)PTR_ALIGN(raw, __alignof__(struct aegis_ctx));
+}
+
+/*
+ * .setkey of the internal algorithm: stores the key in the aligned context.
+ *
+ * Returns 0 on success. For any key length other than AEGIS128L_KEY_SIZE it
+ * sets CRYPTO_TFM_RES_BAD_KEY_LEN on the transform and returns -EINVAL.
+ */
+static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead,
+					  const u8 *key, unsigned int keylen)
+{
+	struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead);
+
+	if (keylen == AEGIS128L_KEY_SIZE) {
+		memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE);
+		return 0;
+	}
+
+	crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+/*
+ * .setauthsize of the internal algorithm: accepts tag lengths from
+ * AEGIS128L_MIN_AUTH_SIZE to AEGIS128L_MAX_AUTH_SIZE inclusive.
+ *
+ * Returns 0 when the length is acceptable and -EINVAL otherwise.
+ */
+static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm,
+					       unsigned int authsize)
+{
+	if (authsize < AEGIS128L_MIN_AUTH_SIZE ||
+	    authsize > AEGIS128L_MAX_AUTH_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Common body of encryption and decryption. Between kernel_fpu_begin() and
+ * kernel_fpu_end() it initialises a local state from the key and req->iv,
+ * absorbs the associated data, processes cryptlen bytes of payload through
+ * `ops`, and finally has the assembly XOR the computed tag into *tag_xor.
+ */
+static void crypto_aegis128l_aesni_crypt(struct aead_request *req,
+					  struct aegis_block *tag_xor,
+					  unsigned int cryptlen,
+					  const struct aegis_crypt_ops *ops)
+{
+	struct aegis_ctx *ctx =
+		crypto_aegis128l_aesni_ctx(crypto_aead_reqtfm(req));
+	struct aegis_state st;
+
+	kernel_fpu_begin();
+
+	crypto_aegis128l_aesni_init(&st, ctx->key.bytes, req->iv);
+	crypto_aegis128l_aesni_process_ad(&st, req->src, req->assoclen);
+	crypto_aegis128l_aesni_process_crypt(&st, req, ops);
+	crypto_aegis128l_aesni_final(&st, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+/*
+ * AEAD .encrypt handler of the internal AES-NI AEGIS-128L algorithm.
+ *
+ * Runs the cipher over the request starting from an all-zero tag block, then
+ * copies the first authsize bytes of that block into req->dst at offset
+ * assoclen + cryptlen. Always returns 0.
+ */
+static int crypto_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+	static const struct aegis_crypt_ops enc_ops = {
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+		.crypt_blocks = crypto_aegis128l_aesni_enc,
+		.crypt_tail = crypto_aegis128l_aesni_enc_tail,
+	};
+	struct aegis_block tag = {};
+	unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+	unsigned int cryptlen = req->cryptlen;
+
+	crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &enc_ops);
+
+	/* Last argument 1: copy from the local buffer into the scatterlist. */
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + cryptlen, taglen, 1);
+	return 0;
+}
+
+/*
+ * AEAD .decrypt handler of the internal AES-NI AEGIS-128L algorithm.
+ *
+ * The authsize bytes that follow the ciphertext in req->src are read into a
+ * local tag block. The cipher run XORs the tag it computes into that block,
+ * so the request is authentic exactly when the first authsize bytes end up
+ * zero.
+ *
+ * Returns 0 on success and -EBADMSG when the tag does not match.
+ */
+static int crypto_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+	static const struct aegis_block zeros = {};
+	static const struct aegis_crypt_ops dec_ops = {
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+		.crypt_blocks = crypto_aegis128l_aesni_dec,
+		.crypt_tail = crypto_aegis128l_aesni_dec_tail,
+	};
+	struct aegis_block tag;
+	unsigned int taglen = crypto_aead_authsize(crypto_aead_reqtfm(req));
+	unsigned int cryptlen = req->cryptlen - taglen;
+
+	/* Last argument 0: copy from the scatterlist into the local buffer. */
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + cryptlen, taglen, 0);
+
+	crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &dec_ops);
+
+	/* crypto_memneq() is used rather than memcmp() for the tag check. */
+	if (crypto_memneq(tag.bytes, zeros.bytes, taglen))
+		return -EBADMSG;
+
+	return 0;
+}
+
+/* AEAD .init hook of the internal algorithm: nothing to set up, returns 0. */
+static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+ return 0;
+}
+
+/* AEAD .exit hook of the internal algorithm: nothing to release. */
+static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+/*
+ * .setkey of the public wrapper: forwards the key to the cryptd transform
+ * whose pointer is stored in this transform's context, and returns whatever
+ * crypto_aead_setkey() returns.
+ */
+static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead,
+					  const u8 *key, unsigned int keylen)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+
+/*
+ * .setauthsize of the public wrapper: forwards the tag length to the cryptd
+ * transform stored in the context and returns the result of
+ * crypto_aead_setauthsize().
+ */
+static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead,
+					       unsigned int authsize)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+
+/*
+ * .encrypt of the public wrapper.
+ *
+ * Picks the transform that will serve the request and re-targets the request
+ * to it: the cryptd child when the FPU is usable and we are either not in
+ * atomic context or cryptd has nothing queued, otherwise the cryptd
+ * transform itself. Returns the result of crypto_aead_encrypt().
+ */
+static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_encrypt(req);
+}
+
+/*
+ * .decrypt of the public wrapper.
+ *
+ * Uses the same transform selection as the encrypt path: the cryptd child
+ * when the FPU is usable and we are either not in atomic context or cryptd
+ * has nothing queued, otherwise the cryptd transform itself. Returns the
+ * result of crypto_aead_decrypt().
+ */
+static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_decrypt(req);
+}
+
+/*
+ * .init of the public wrapper: allocates a cryptd AEAD around the internal
+ * "__aegis128l-aesni" driver, stores its pointer in the transform context
+ * and adopts the cryptd transform's request size.
+ *
+ * Returns 0 on success or the error encoded in the pointer returned by
+ * cryptd_alloc_aead().
+ */
+static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+	struct cryptd_aead *inner;
+
+	inner = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL,
+				  CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(inner))
+		return PTR_ERR(inner);
+
+	*slot = inner;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&inner->base));
+
+	return 0;
+}
+
+/* .exit of the public wrapper: frees the cryptd AEAD stored in the context. */
+static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*slot);
+}
+
+/*
+ * The two AEAD algorithms registered by this module:
+ * [0] "__aegis128l-aesni": flagged CRYPTO_ALG_INTERNAL, implemented by the
+ * crypto_aegis128l_aesni_* handlers.
+ * [1] "aegis128l-aesni": flagged CRYPTO_ALG_ASYNC with priority 400,
+ * implemented by the cryptd_aegis128l_aesni_* wrappers, which allocate
+ * a cryptd instance of entry [0].
+ */
+static struct aead_alg crypto_aegis128l_aesni_alg[] = {
+ {
+ .setkey = crypto_aegis128l_aesni_setkey,
+ .setauthsize = crypto_aegis128l_aesni_setauthsize,
+ .encrypt = crypto_aegis128l_aesni_encrypt,
+ .decrypt = crypto_aegis128l_aesni_decrypt,
+ .init = crypto_aegis128l_aesni_init_tfm,
+ .exit = crypto_aegis128l_aesni_exit_tfm,
+
+ .ivsize = AEGIS128L_NONCE_SIZE,
+ .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128L_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ /* extra __alignof__ bytes: crypto_aegis128l_aesni_ctx() aligns by hand */
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+
+ .cra_name = "__aegis128l",
+ .cra_driver_name = "__aegis128l-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }, {
+ .setkey = cryptd_aegis128l_aesni_setkey,
+ .setauthsize = cryptd_aegis128l_aesni_setauthsize,
+ .encrypt = cryptd_aegis128l_aesni_encrypt,
+ .decrypt = cryptd_aegis128l_aesni_decrypt,
+ .init = cryptd_aegis128l_aesni_init_tfm,
+ .exit = cryptd_aegis128l_aesni_exit_tfm,
+
+ .ivsize = AEGIS128L_NONCE_SIZE,
+ .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128L_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ /* the context holds only the pointer to the cryptd transform */
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_alignmask = 0,
+
+ .cra_priority = 400,
+
+ .cra_name = "aegis128l",
+ .cra_driver_name = "aegis128l-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }
+};
+
+/*
+ * CPU feature table: one entry for AES-NI and one for SSE2. It is exported
+ * through MODULE_DEVICE_TABLE and passed to x86_match_cpu() at module init.
+ * NOTE(review): x86_match_cpu() is understood to succeed when ANY single
+ * entry matches, so this table alone does not require both features at once
+ * - confirm against the x86_match_cpu() documentation.
+ */
+static const struct x86_cpu_id aesni_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_AES),
+ X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+/*
+ * Module entry point: registers both AEGIS-128L AEAD algorithms, but only on
+ * CPUs that provide AES-NI *and* SSE2.
+ *
+ * Returns 0 on success, -ENODEV when a required CPU feature is missing, or
+ * the error returned by crypto_register_aeads().
+ */
+static int __init crypto_aegis128l_aesni_module_init(void)
+{
+	/*
+	 * x86_match_cpu() reports a match as soon as ANY entry of the table
+	 * matches, so on its own it would accept a CPU that has SSE2 but no
+	 * AES-NI, and the aesenc-based assembly would then fault. Require
+	 * each feature explicitly.
+	 */
+	if (!x86_match_cpu(aesni_cpu_id) ||
+	    !boot_cpu_has(X86_FEATURE_AES) ||
+	    !boot_cpu_has(X86_FEATURE_XMM2))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_aegis128l_aesni_alg,
+				     ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+/* Module exit: unregisters every algorithm in crypto_aegis128l_aesni_alg. */
+static void __exit crypto_aegis128l_aesni_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_aegis128l_aesni_alg,
+ ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+/* Hook the init/exit functions into module load and unload. */
+module_init(crypto_aegis128l_aesni_module_init);
+module_exit(crypto_aegis128l_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation");
+/* Aliases for the algorithm name and the driver name of the public entry. */
+MODULE_ALIAS_CRYPTO("aegis128l");
+MODULE_ALIAS_CRYPTO("aegis128l-aesni");
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
new file mode 100644
index 000000000000..1d977d515bf9
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-asm.S
@@ -0,0 +1,702 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-256
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define STATE5 %xmm5
+#define MSG %xmm6
+#define T0 %xmm7
+#define T1 %xmm8
+#define T2 %xmm9
+#define T3 %xmm10
+
+#define STATEP %rdi
+#define LEN %rsi
+#define SRC %rdx
+#define DST %rcx
+
+.section .rodata.cst16.aegis256_const, "aM", @progbits, 32
+.align 16
+.Laegis256_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis256_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16
+.align 16
+.Laegis256_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ *
+ * Assemble a partial block from SRC into MSG. The low four bits of LEN select
+ * which of a 1-, 2-, 4- and 8-byte load are performed; each load is taken at
+ * the offset formed by the higher LEN bits, so for LEN below 16 only bytes
+ * inside [SRC, SRC + LEN) are read. The pieces are merged so that MSG holds
+ * the LEN source bytes in order, followed by zero bytes.
+ * input:
+ * LEN - bytes
+ * SRC - src
+ * output:
+ * MSG - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ /* %r9 collects the pieces shorter than 8 bytes; MSG starts all-zero */
+ xor %r9, %r9
+ pxor MSG, MSG
+
+ /* LEN bit 0: one byte at offset (LEN & 0x1E) */
+ mov LEN, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov LEN, %r8
+ and $0x1E, %r8
+ add SRC, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ /* LEN bit 1: two bytes at offset (LEN & 0x1C), placed below the byte */
+ mov LEN, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov LEN, %r8
+ and $0x1C, %r8
+ add SRC, %r8
+ shl $0x10, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ /* LEN bit 2: four bytes at offset (LEN & 0x18), placed below the rest */
+ mov LEN, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov LEN, %r8
+ and $0x18, %r8
+ add SRC, %r8
+ shl $32, %r9
+ /* the 32-bit load zero-extends %r8, so the xor only fills the low half */
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ /* move the (up to 7 byte) remainder into the low quadword of MSG */
+ movq %r9, MSG
+
+ /* LEN bit 3: eight bytes at offset (LEN & 0x10) become the low quadword */
+ mov LEN, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov LEN, %r8
+ and $0x10, %r8
+ add SRC, %r8
+ /* shift the remainder up into the high quadword first */
+ pslldq $8, MSG
+ movq (%r8), T0
+ pxor T0, MSG
+
+.Lld_partial_8:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ *
+ * Write the leading bytes of T0 to DST using at most one 8-, 4-, 2- and
+ * 1-byte store each (so never more than 15 bytes), stopping once LEN bytes
+ * have been written.
+ * input:
+ * LEN - bytes
+ * DST - dst
+ * T0 - message block (consumed: shifted right while being stored)
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ /* %r8 = bytes still to write, %r9 = write cursor, %r10 = pending data */
+ mov LEN, %r8
+ mov DST, %r9
+
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ /* store the low quadword, then continue with the high quadword */
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $0x10, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * update: one state-update step without message injection.
+ * Every state register goes through one AES round (aesenc) with a neighbouring
+ * register as the round key; the original STATE5 is saved in T0 because it is
+ * overwritten before it is needed as the key for STATE4.
+ * The register operations are the same on every invocation, so the result of
+ * a step ends up shifted by one register relative to its inputs (the value
+ * derived from old STATE5 and STATE0 lands in STATE5, the one derived from
+ * old STATE0 and STATE1 lands in STATE0, and so on).
+ * Clobbers T0.
+ */
+.macro update
+ movdqa STATE5, T0
+ aesenc STATE0, STATE5
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc STATE4, STATE3
+ aesenc T0, STATE4
+.endm
+
+/*
+ * update0..update5: "update" followed by xor-ing message block \m into one
+ * state register. The target register moves down by one with each variant
+ * (STATE5 for update0, STATE4 for update1, ... STATE0 for update5), which
+ * follows the one-register shift introduced by every "update".
+ */
+.macro update0 m
+ update
+ pxor \m, STATE5
+.endm
+
+.macro update1 m
+ update
+ pxor \m, STATE4
+.endm
+
+.macro update2 m
+ update
+ pxor \m, STATE3
+.endm
+
+.macro update3 m
+ update
+ pxor \m, STATE2
+.endm
+
+.macro update4 m
+ update
+ pxor \m, STATE1
+.endm
+
+.macro update5 m
+ update
+ pxor \m, STATE0
+.endm
+
+/* state_load: read the six 16-byte state blocks at STATEP into STATE0..5. */
+.macro state_load
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+ movdqu 0x50(STATEP), STATE5
+.endm
+
+/*
+ * state_store: write six registers back to STATEP. Note the order: the LAST
+ * argument (\s5) goes to offset 0x00 and \s0..\s4 go to 0x10..0x50.
+ */
+.macro state_store s0 s1 s2 s3 s4 s5
+ movdqu \s5, 0x00(STATEP)
+ movdqu \s0, 0x10(STATEP)
+ movdqu \s1, 0x20(STATEP)
+ movdqu \s2, 0x30(STATEP)
+ movdqu \s3, 0x40(STATEP)
+ movdqu \s4, 0x50(STATEP)
+.endm
+
+/*
+ * state_store0..state_store5: state_store with the register list rotated by
+ * N positions. In this file state_storeN is always used right after the
+ * matching updateN, so that the register which received the message block is
+ * the one written to offset 0x00.
+ */
+.macro state_store0
+ state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store1
+ state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store2
+ state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store3
+ state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store4
+ state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro state_store5
+ state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+/*
+ * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv);
+ *
+ * Build the initial state from the 32-byte key (%rsi) and 32-byte IV (%rdx)
+ * and store it at state (%rdi). The key is read with aligned loads (movdqa),
+ * so it must be 16-byte aligned; the IV is read with unaligned loads.
+ */
+ENTRY(crypto_aegis256_aesni_init)
+ FRAME_BEGIN
+
+ /* load key: */
+ movdqa 0x00(%rsi), MSG
+ movdqa 0x10(%rsi), T1
+ movdqa MSG, STATE4
+ movdqa T1, STATE5
+
+ /* load IV: */
+ movdqu 0x00(%rdx), T2
+ movdqu 0x10(%rdx), T3
+ /* T2/T3 = IV xor key; they seed STATE0/STATE1 and are re-used below */
+ pxor MSG, T2
+ pxor T1, T3
+ movdqa T2, STATE0
+ movdqa T3, STATE1
+
+ /* load the constants: */
+ movdqa .Laegis256_const_0, STATE3
+ movdqa .Laegis256_const_1, STATE2
+ /* STATE4/STATE5 = key halves xor constants */
+ pxor STATE3, STATE4
+ pxor STATE2, STATE5
+
+ /*
+ * update 16 times, cycling through key half 0 (MSG), key half 1 (T1),
+ * key^IV half 0 (T2) and key^IV half 1 (T3) as the injected block:
+ */
+ update0 MSG
+ update1 T1
+ update2 T2
+ update3 T3
+ update4 MSG
+ update5 T1
+ update0 T2
+ update1 T3
+ update2 MSG
+ update3 T1
+ update4 T2
+ update5 T3
+ update0 MSG
+ update1 T1
+ update2 T2
+ update3 T3
+
+ /* the last step was update3, hence the matching store variant */
+ state_store3
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_init)
+
+/*
+ * ad_block: absorb one 16-byte block of associated data.
+ * \a - suffix for the movdq load: "a" (aligned) or "u" (unaligned)
+ * \i - position 0..5 inside the 6-block unrolled loop; selects the source
+ * offset (\i * 0x10), the updateN variant and the exit label
+ * Decrements LEN by 16 and leaves through .Lad_out_\i when fewer than 16
+ * bytes remain (signed comparison).
+ */
+.macro ad_block a i
+ movdq\a (\i * 0x10)(SRC), MSG
+ update\i MSG
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_ad(void *state, unsigned int length,
+ * const void *data);
+ *
+ * Absorb the whole 16-byte blocks of the associated data into the state.
+ * Returns without touching the state when length is below 16; a trailing
+ * partial block is not processed here.
+ */
+ENTRY(crypto_aegis256_aesni_ad)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lad_out
+
+ state_load
+
+ /* pick the aligned-load loop only when SRC is 16-byte aligned */
+ mov SRC, %r8
+ and $0xf, %r8
+ jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+ /* six blocks per iteration; each ad_block may exit to .Lad_out_N */
+ ad_block a 0
+ ad_block a 1
+ ad_block a 2
+ ad_block a 3
+ ad_block a 4
+ ad_block a 5
+
+ add $0x60, SRC
+ jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+ ad_block u 0
+ ad_block u 1
+ ad_block u 2
+ ad_block u 3
+ ad_block u 4
+ ad_block u 5
+
+ add $0x60, SRC
+ jmp .Lad_u_loop
+
+ /* exit N: the last step was updateN, so store with state_storeN */
+.Lad_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lad_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lad_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lad_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lad_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lad_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+ /* nothing processed: state is left as it was */
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_ad)
+
+/*
+ * crypt: xor the keystream block into \m:
+ * \m ^= \s1 ^ \s4 ^ \s5 ^ (\s2 & \s3)
+ * \s0 is accepted for symmetry but not used. Clobbers T3.
+ */
+.macro crypt m s0 s1 s2 s3 s4 s5
+ pxor \s1, \m
+ pxor \s4, \m
+ pxor \s5, \m
+ movdqa \s2, T3
+ pand \s3, T3
+ pxor T3, \m
+.endm
+
+/*
+ * crypt0..crypt5: crypt with the register list rotated by N positions, for
+ * use on block N of the unrolled loops (i.e. after N update steps since the
+ * registers were loaded in their natural order).
+ */
+.macro crypt0 m
+ crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt1 m
+ crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt2 m
+ crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt3 m
+ crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt4 m
+ crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro crypt5 m
+ crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+/*
+ * encrypt_block: encrypt one 16-byte block at position \i (0..5) of the
+ * unrolled loop; \a is the movdq suffix ("a" aligned / "u" unaligned).
+ * The ciphertext is produced in T0 and stored before the update, because the
+ * update macros clobber T0; the state is updated with the plaintext (MSG).
+ * Leaves through .Lenc_out_\i when fewer than 16 bytes remain.
+ */
+.macro encrypt_block a i
+ movdq\a (\i * 0x10)(SRC), MSG
+ movdqa MSG, T0
+ crypt\i T0
+ movdq\a T0, (\i * 0x10)(DST)
+
+ update\i MSG
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lenc_out_\i
+.endm
+
+/*
+ * decrypt_block: decrypt one 16-byte block at position \i (0..5); \a as
+ * above. The block is decrypted in place in MSG, stored, and the state is
+ * then updated with the recovered plaintext.
+ * Leaves through .Ldec_out_\i when fewer than 16 bytes remain.
+ */
+.macro decrypt_block a i
+ movdq\a (\i * 0x10)(SRC), MSG
+ crypt\i MSG
+ movdq\a MSG, (\i * 0x10)(DST)
+
+ update\i MSG
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_enc(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Encrypt the whole 16-byte blocks of src into dst and update the state.
+ * Returns immediately, leaving the state untouched, when length is below 16;
+ * a trailing partial block is not handled here.
+ */
+ENTRY(crypto_aegis256_aesni_enc)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lenc_out
+
+ state_load
+
+ /* aligned loop only when both SRC and DST are 16-byte aligned */
+ mov SRC, %r8
+ or DST, %r8
+ and $0xf, %r8
+ jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+ encrypt_block a 0
+ encrypt_block a 1
+ encrypt_block a 2
+ encrypt_block a 3
+ encrypt_block a 4
+ encrypt_block a 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+ encrypt_block u 0
+ encrypt_block u 1
+ encrypt_block u 2
+ encrypt_block u 3
+ encrypt_block u 4
+ encrypt_block u 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Lenc_u_loop
+
+ /* exit N: the last step was updateN, so store with state_storeN */
+.Lenc_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lenc_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lenc_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lenc_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lenc_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lenc_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_enc)
+
+/*
+ * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ *
+ * Encrypt a final partial block: LEN bytes are gathered from SRC into a
+ * zero-padded block, encrypted, and exactly LEN ciphertext bytes are written
+ * to DST. The state is then updated with the zero-padded plaintext.
+ */
+ENTRY(crypto_aegis256_aesni_enc_tail)
+	FRAME_BEGIN
+
+	state_load
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa MSG, T0
+	crypt0 T0
+
+	/* T0 is consumed here; MSG still holds the padded plaintext */
+	call __store_partial
+
+	update0 MSG
+
+	state_store0
+
+	FRAME_END
+	/*
+	 * Explicit return: without it execution runs on into
+	 * crypto_aegis256_aesni_dec, which directly follows this function.
+	 */
+	ret
+ENDPROC(crypto_aegis256_aesni_enc_tail)
+
+/*
+ * void crypto_aegis256_aesni_dec(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Decrypt the whole 16-byte blocks of src into dst and update the state.
+ * Returns immediately, leaving the state untouched, when length is below 16;
+ * a trailing partial block is not handled here.
+ */
+ENTRY(crypto_aegis256_aesni_dec)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Ldec_out
+
+ state_load
+
+ /* aligned loop only when both SRC and DST are 16-byte aligned */
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+ decrypt_block a 0
+ decrypt_block a 1
+ decrypt_block a 2
+ decrypt_block a 3
+ decrypt_block a 4
+ decrypt_block a 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+ decrypt_block u 0
+ decrypt_block u 1
+ decrypt_block u 2
+ decrypt_block u 3
+ decrypt_block u 4
+ decrypt_block u 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Ldec_u_loop
+
+ /* exit N: the last step was updateN, so store with state_storeN */
+.Ldec_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Ldec_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Ldec_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Ldec_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Ldec_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Ldec_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_dec)
+
+/*
+ * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ *
+ * Decrypt a final partial block of LEN bytes. After decryption the bytes at
+ * positions >= LEN are cleared again before the state update, because
+ * xor-ing the keystream into the zero padding made them non-zero.
+ */
+ENTRY(crypto_aegis256_aesni_dec_tail)
+ FRAME_BEGIN
+
+ state_load
+
+ /* decrypt message: */
+ call __load_partial
+
+ crypt0 MSG
+
+ /* __store_partial consumes T0, so hand it a copy and keep MSG */
+ movdqa MSG, T0
+ call __store_partial
+
+ /* mask with byte count: */
+ movq LEN, T0
+ /* four byte-interleaves replicate the low byte of LEN into all 16 bytes */
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa .Laegis256_counter, T1
+ /* T0[i] = 0xff where LEN > i (signed byte compare against 0..15) */
+ pcmpgtb T1, T0
+ pand T0, MSG
+
+ update0 MSG
+
+ state_store0
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_dec_tail)
+
+/*
+ * void crypto_aegis256_aesni_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ *
+ * Finalise the state and fold the resulting tag into the 16 bytes at tag_xor
+ * (%rsi) by xor. Both lengths are read as full 64-bit registers (%rdx, %rcx).
+ * The state in memory is not written back.
+ */
+ENTRY(crypto_aegis256_aesni_final)
+ FRAME_BEGIN
+
+ state_load
+
+ /* prepare length block: */
+ /* low quadword = assoclen, high quadword = cryptlen */
+ movq %rdx, MSG
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ pxor STATE3, MSG
+
+ /* update state: */
+ /* seven updates with the same block */
+ update0 MSG
+ update1 MSG
+ update2 MSG
+ update3 MSG
+ update4 MSG
+ update5 MSG
+ update0 MSG
+
+ /* xor tag: */
+ movdqu (%rsi), MSG
+
+ /* the tag is the xor of all six state registers, so order is irrelevant */
+ pxor STATE0, MSG
+ pxor STATE1, MSG
+ pxor STATE2, MSG
+ pxor STATE3, MSG
+ pxor STATE4, MSG
+ pxor STATE5, MSG
+
+ movdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_final)
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
new file mode 100644
index 000000000000..2b5dd3af8f4d
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-256 Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS256_BLOCK_ALIGN 16
+#define AEGIS256_BLOCK_SIZE 16
+#define AEGIS256_NONCE_SIZE 32
+#define AEGIS256_STATE_BLOCKS 6
+#define AEGIS256_KEY_SIZE 32
+#define AEGIS256_MIN_AUTH_SIZE 8
+#define AEGIS256_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis256_aesni_ad(
+ void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis256_aesni_enc(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_enc_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+/*
+ * The assembly routine takes the lengths in the order (assoclen, cryptlen)
+ * and reads each of them as a full 64-bit register. Declaring them as u64
+ * makes the compiler zero-extend the callers' 32-bit values instead of
+ * leaving the upper register halves unspecified, and the parameter names now
+ * match the order in which the single call site passes them.
+ */
+asmlinkage void crypto_aegis256_aesni_final(
+		void *state, void *tag_xor, u64 assoclen, u64 cryptlen);
+
+/* One 16-byte AEGIS block, aligned to AEGIS256_BLOCK_ALIGN. */
+struct aegis_block {
+ u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN);
+};
+
+/* Cipher state handed to the assembly routines: AEGIS256_STATE_BLOCKS blocks. */
+struct aegis_state {
+ struct aegis_block blocks[AEGIS256_STATE_BLOCKS];
+};
+
+/* Per-transform context: the key, stored as block-aligned blocks. */
+struct aegis_ctx {
+ struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE];
+};
+
+/*
+ * Direction-specific operations used by the shared crypt path:
+ * skcipher_walk_init - starts the walk over the request's data
+ * crypt_blocks - processes the whole blocks of a chunk
+ * crypt_tail - processes a trailing partial block
+ */
+struct aegis_crypt_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+ void *dst);
+ void (*crypt_tail)(void *state, unsigned int length, const void *src,
+ void *dst);
+};
+
+/*
+ * Feed assoclen bytes of associated data from the scatterlist sg_src into the
+ * state. Data is passed to the assembly routine in whole blocks; bytes that
+ * do not fill a block at the end of a scatterlist segment are collected in
+ * buf (pos = number of bytes buffered) and completed from the next segment.
+ * A final incomplete block is zero-padded.
+ */
+static void crypto_aegis256_aesni_process_ad(
+ struct aegis_state *state, struct scatterlist *sg_src,
+ unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct aegis_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ /* enough data (buffered + mapped) for at least one full block? */
+ if (pos + size >= AEGIS256_BLOCK_SIZE) {
+ if (pos > 0) {
+ /* complete the buffered block first */
+ unsigned int fill = AEGIS256_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ crypto_aegis256_aesni_ad(state,
+ AEGIS256_BLOCK_SIZE,
+ buf.bytes);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ /* the callee consumes only the whole blocks of "left" */
+ crypto_aegis256_aesni_ad(state, left, src);
+
+ src += left & ~(AEGIS256_BLOCK_SIZE - 1);
+ left &= AEGIS256_BLOCK_SIZE - 1;
+ }
+
+ /* stash the remainder (less than one block) for the next round */
+ memcpy(buf.bytes + pos, src, left);
+ pos += left;
+ assoclen -= size;
+
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ /* zero-pad and absorb the final partial block */
+ memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos);
+ crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes);
+ }
+}
+
+/*
+ * Walk the request's data and run it through ops: the whole blocks of every
+ * chunk go to ops->crypt_blocks, a remaining partial block (fewer than
+ * AEGIS256_BLOCK_SIZE bytes) goes to ops->crypt_tail.
+ * The return values of the walk helpers are not checked here.
+ */
+static void crypto_aegis256_aesni_process_crypt(
+ struct aegis_state *state, struct aead_request *req,
+ const struct aegis_crypt_ops *ops)
+{
+ struct skcipher_walk walk;
+ u8 *src, *dst;
+ unsigned int chunksize, base;
+
+ ops->skcipher_walk_init(&walk, req, false);
+
+ while (walk.nbytes) {
+ src = walk.src.virt.addr;
+ dst = walk.dst.virt.addr;
+ chunksize = walk.nbytes;
+
+ ops->crypt_blocks(state, chunksize, src, dst);
+
+ /* skip over the whole blocks handled above */
+ base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1);
+ src += base;
+ dst += base;
+ chunksize &= AEGIS256_BLOCK_SIZE - 1;
+
+ if (chunksize > 0)
+ ops->crypt_tail(state, chunksize, src, dst);
+
+ /* the whole chunk is reported as consumed */
+ skcipher_walk_done(&walk, 0);
+ }
+}
+
+/*
+ * Locate the aegis_ctx inside the transform's context area: the raw context
+ * pointer is rounded up to the alignment that struct aegis_ctx requires.
+ */
+static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead)
+{
+	u8 *raw = crypto_aead_ctx(aead);
+
+	return (void *)PTR_ALIGN(raw, __alignof__(struct aegis_ctx));
+}
+
+/*
+ * Install a key. Only keys of exactly AEGIS256_KEY_SIZE bytes are accepted;
+ * any other length flags the transform with CRYPTO_TFM_RES_BAD_KEY_LEN and
+ * yields -EINVAL. Returns 0 on success.
+ */
+static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+					unsigned int keylen)
+{
+	struct aegis_ctx *ctx;
+
+	if (keylen == AEGIS256_KEY_SIZE) {
+		ctx = crypto_aegis256_aesni_ctx(aead);
+		memcpy(ctx->key, key, AEGIS256_KEY_SIZE);
+		return 0;
+	}
+
+	crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+/*
+ * Validate a requested tag length: it must lie within
+ * [AEGIS256_MIN_AUTH_SIZE, AEGIS256_MAX_AUTH_SIZE]. Returns 0 when it does
+ * and -EINVAL otherwise; nothing is stored here.
+ */
+static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm,
+					     unsigned int authsize)
+{
+	if (authsize < AEGIS256_MIN_AUTH_SIZE ||
+	    authsize > AEGIS256_MAX_AUTH_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Common encrypt/decrypt path. Inside one kernel_fpu_begin()/kernel_fpu_end()
+ * section it initialises a state from the key and req->iv, absorbs the
+ * associated data, processes cryptlen bytes of payload through ops and
+ * finally xors the computed tag into *tag_xor.
+ */
+static void crypto_aegis256_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen,
+ const struct aegis_crypt_ops *ops)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm);
+ struct aegis_state state;
+
+ kernel_fpu_begin();
+
+ crypto_aegis256_aesni_init(&state, ctx->key, req->iv);
+ crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen);
+ crypto_aegis256_aesni_process_crypt(&state, req, ops);
+ /* lengths are passed as (assoclen, cryptlen), the order the asm reads */
+ crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+ kernel_fpu_end();
+}
+
+/*
+ * AEAD encrypt: encrypt req->cryptlen bytes and append the first authsize
+ * bytes of the tag to the destination, right after the associated data and
+ * ciphertext. Always returns 0.
+ */
+static int crypto_aegis256_aesni_encrypt(struct aead_request *req)
+{
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_encrypt,
+ .crypt_blocks = crypto_aegis256_aesni_enc,
+ .crypt_tail = crypto_aegis256_aesni_enc_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ /* starts as zero, so the xor performed by the final step yields the tag */
+ struct aegis_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+
+ crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+
+/*
+ * AEAD decrypt: the last authsize bytes of the source are the received tag.
+ * It is copied into "tag", the computed tag is xor-ed on top of it, and the
+ * first authsize bytes must then be all zero.
+ * Returns 0 when the tag verifies and -EBADMSG otherwise.
+ */
+static int crypto_aegis256_aesni_decrypt(struct aead_request *req)
+{
+ static const struct aegis_block zeros = {};
+
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_decrypt,
+ .crypt_blocks = crypto_aegis256_aesni_dec,
+ .crypt_tail = crypto_aegis256_aesni_dec_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ /* payload length without the trailing tag */
+ unsigned int cryptlen = req->cryptlen - authsize;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ /* only the first authsize bytes of tag were initialised and are compared */
+ return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+/* Transform constructor for the internal algorithm: nothing to set up. */
+static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+ return 0;
+}
+
+/* Transform destructor for the internal algorithm: nothing to release. */
+static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+/*
+ * Wrapper setkey: the wrapper's context holds a single cryptd_aead pointer;
+ * the key is passed straight through to that transform.
+ */
+static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead,
+					const u8 *key, unsigned int keylen)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+
+/*
+ * Wrapper setauthsize: passes the requested tag length on to the cryptd
+ * transform stored in the wrapper's context.
+ */
+static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead,
+					     unsigned int authsize)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+
+/*
+ * Wrapper encrypt: decide which transform services the request.
+ * The request goes to the cryptd transform when the FPU cannot be used in the
+ * current context, or when we are atomic and cryptd already has requests
+ * queued; otherwise it goes directly to cryptd's child transform.
+ */
+static int cryptd_aegis256_aesni_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct cryptd_aead **slot = crypto_aead_ctx(tfm);
+	struct cryptd_aead *cryptd_tfm = *slot;
+	bool defer;
+
+	defer = !irq_fpu_usable() ||
+		(in_atomic() && cryptd_aead_queued(cryptd_tfm));
+
+	if (defer)
+		tfm = &cryptd_tfm->base;
+	else
+		tfm = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, tfm);
+
+	return crypto_aead_encrypt(req);
+}
+
+/*
+ * Wrapper decrypt: decide which transform services the request.
+ * The request goes to the cryptd transform when the FPU cannot be used in the
+ * current context, or when we are atomic and cryptd already has requests
+ * queued; otherwise it goes directly to cryptd's child transform.
+ */
+static int cryptd_aegis256_aesni_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct cryptd_aead **slot = crypto_aead_ctx(tfm);
+	struct cryptd_aead *cryptd_tfm = *slot;
+	bool defer;
+
+	defer = !irq_fpu_usable() ||
+		(in_atomic() && cryptd_aead_queued(cryptd_tfm));
+
+	if (defer)
+		tfm = &cryptd_tfm->base;
+	else
+		tfm = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, tfm);
+
+	return crypto_aead_decrypt(req);
+}
+
+/*
+ * Wrapper constructor: allocate a cryptd AEAD on top of the internal
+ * "__aegis256-aesni" driver, remember it in the wrapper's context and adopt
+ * its request size. Returns 0, or the error from cryptd_alloc_aead().
+ */
+static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+ return 0;
+}
+
+/* Wrapper destructor: free the cryptd AEAD allocated by the constructor. */
+static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+
+/*
+ * Two algorithms are registered:
+ * [0] "__aegis256" / "__aegis256-aesni": the actual implementation, flagged
+ * CRYPTO_ALG_INTERNAL;
+ * [1] "aegis256" / "aegis256-aesni": the asynchronous wrapper (priority 400)
+ * whose callbacks forward to a cryptd instance of [0].
+ */
+static struct aead_alg crypto_aegis256_aesni_alg[] = {
+ {
+ .setkey = crypto_aegis256_aesni_setkey,
+ .setauthsize = crypto_aegis256_aesni_setauthsize,
+ .encrypt = crypto_aegis256_aesni_encrypt,
+ .decrypt = crypto_aegis256_aesni_decrypt,
+ .init = crypto_aegis256_aesni_init_tfm,
+ .exit = crypto_aegis256_aesni_exit_tfm,
+
+ .ivsize = AEGIS256_NONCE_SIZE,
+ .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+ .chunksize = AEGIS256_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ /* extra room so the context can be aligned by hand */
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+
+ .cra_name = "__aegis256",
+ .cra_driver_name = "__aegis256-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }, {
+ .setkey = cryptd_aegis256_aesni_setkey,
+ .setauthsize = cryptd_aegis256_aesni_setauthsize,
+ .encrypt = cryptd_aegis256_aesni_encrypt,
+ .decrypt = cryptd_aegis256_aesni_decrypt,
+ .init = cryptd_aegis256_aesni_init_tfm,
+ .exit = cryptd_aegis256_aesni_exit_tfm,
+
+ .ivsize = AEGIS256_NONCE_SIZE,
+ .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+ .chunksize = AEGIS256_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ /* the context is just the pointer to the cryptd transform */
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_alignmask = 0,
+
+ .cra_priority = 400,
+
+ .cra_name = "aegis256",
+ .cra_driver_name = "aegis256-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }
+};
+
+/*
+ * CPU feature table: exported for module autoloading through
+ * MODULE_DEVICE_TABLE() and consulted by the module init function via
+ * x86_match_cpu().
+ * NOTE(review): AES-NI and SSE2 are listed as two separate entries;
+ * x86_match_cpu() presumably accepts a CPU that matches either entry rather
+ * than requiring both -- confirm against the x86_match_cpu() documentation.
+ */
+static const struct x86_cpu_id aesni_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_AES),
+ X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+/*
+ * Module entry point: register the internal and the cryptd-wrapped AEGIS-256
+ * AEADs, but only on CPUs that provide *both* AES-NI and SSE2.
+ *
+ * x86_match_cpu() reports a match as soon as any single table entry matches,
+ * so the two-entry aesni_cpu_id table on its own would also accept a CPU that
+ * has SSE2 but no AES-NI. The explicit boot_cpu_has() tests make the
+ * requirement a conjunction.
+ *
+ * Returns -ENODEV when a required feature is missing, otherwise the result of
+ * crypto_register_aeads().
+ */
+static int __init crypto_aegis256_aesni_module_init(void)
+{
+	if (!x86_match_cpu(aesni_cpu_id) ||
+	    !boot_cpu_has(X86_FEATURE_AES) ||
+	    !boot_cpu_has(X86_FEATURE_XMM2))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_aegis256_aesni_alg,
+				     ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+/* Module exit point: unregister every algorithm registered at init time. */
+static void __exit crypto_aegis256_aesni_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_aegis256_aesni_alg,
+ ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+module_init(crypto_aegis256_aesni_module_init);
+module_exit(crypto_aegis256_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis256");
+MODULE_ALIAS_CRYPTO("aegis256-aesni");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 0420bab19efb..2ddbe3a1868b 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -364,5 +364,5 @@ module_exit(ghash_pclmulqdqni_mod_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
- "acclerated by PCLMULQDQ-NI");
+ "accelerated by PCLMULQDQ-NI");
MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
new file mode 100644
index 000000000000..37d422e77931
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-asm.S
@@ -0,0 +1,621 @@
+/*
+ * AVX2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+ (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+#define STATE0 %ymm0
+#define STATE0_LOW %xmm0
+#define STATE1 %ymm1
+#define STATE2 %ymm2
+#define STATE3 %ymm3
+#define STATE4 %ymm4
+#define KEY %ymm5
+#define MSG %ymm5
+#define MSG_LOW %xmm5
+#define T0 %ymm6
+#define T0_LOW %xmm6
+#define T1 %ymm7
+
+.section .rodata.cst32.morus1280_const, "aM", @progbits, 32
+.align 32
+.Lmorus1280_const:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32
+.align 32
+.Lmorus1280_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+/*
+ * morus1280_round: one round of the state update.
+ * \s0 ^= (\s1 & \s2) ^ \s3
+ * each 64-bit lane of \s0 is rotated left by \b bits
+ * the 64-bit lanes of \s3 are permuted by immediate \w
+ * \s4 is accepted but not referenced. Clobbers T0.
+ */
+.macro morus1280_round s0, s1, s2, s3, s4, b, w
+ vpand \s1, \s2, T0
+ vpxor T0, \s0, \s0
+ vpxor \s3, \s0, \s0
+ /* rotate: the two shifted halves have no bits in common, so xor merges */
+ vpsllq $\b, \s0, T0
+ vpsrlq $(64 - \b), \s0, \s0
+ vpxor T0, \s0, \s0
+ vpermq $\w, \s3, \s3
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ *
+ * Five rounds over the rotating register assignment; the message block is
+ * xor-ed into STATE1..STATE4 immediately before the round that uses each of
+ * them as its first operand (the first round takes no message input).
+ * input:
+ * STATE[0-4] - input state
+ * MSG - message block
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus1280_update:
+ morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+ vpxor MSG, STATE1, STATE1
+ morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+ vpxor MSG, STATE2, STATE2
+ morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+ vpxor MSG, STATE3, STATE3
+ morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
+ vpxor MSG, STATE4, STATE4
+ morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
+ ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ *
+ * Same five rounds as __morus1280_update but without any message xor, i.e.
+ * an update with an all-zero block. The MSG/KEY register is not touched.
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus1280_update_zero:
+ morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+ morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+ morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+ morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
+ morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
+ ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ *
+ * Assemble a partial block from %rsi into MSG. The low five bits of %rcx
+ * select which of a 1-, 2-, 4-, 8- and 16-byte load are performed; the pieces
+ * are merged so that MSG holds the source bytes in order followed by zeros.
+ * NOTE(review): callers presumably pass fewer than 32 bytes -- confirm.
+ * input:
+ * %rsi - src
+ * %rcx - bytes
+ * output:
+ * MSG - message block
+ * changed:
+ * %r8
+ * %r9
+ */
+__load_partial:
+ /* %r9 collects the pieces shorter than 8 bytes; MSG starts all-zero */
+ xor %r9, %r9
+ vpxor MSG, MSG, MSG
+
+ /* bit 0: one byte at offset (bytes & 0x1E) */
+ mov %rcx, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov %rcx, %r8
+ and $0x1E, %r8
+ add %rsi, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ /* bit 1: two bytes at offset (bytes & 0x1C) */
+ mov %rcx, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov %rcx, %r8
+ and $0x1C, %r8
+ add %rsi, %r8
+ shl $16, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ /* bit 2: four bytes at offset (bytes & 0x18) */
+ mov %rcx, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov %rcx, %r8
+ and $0x18, %r8
+ add %rsi, %r8
+ shl $32, %r9
+ /* the 32-bit load zero-extends %r8, so the xor only fills the low half */
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG_LOW
+
+ /* bit 3: eight bytes at offset (bytes & 0x10) */
+ mov %rcx, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov %rcx, %r8
+ and $0x10, %r8
+ add %rsi, %r8
+ /* swap the quadwords of the low lane, then insert below the remainder */
+ pshufd $MASK2, MSG_LOW, MSG_LOW
+ pinsrq $0, (%r8), MSG_LOW
+
+.Lld_partial_8:
+ /* bit 4: the first sixteen bytes of src */
+ mov %rcx, %r8
+ and $0x10, %r8
+ jz .Lld_partial_16
+
+ /* swap the 128-bit halves, then load src into the now-free low half */
+ vpermq $MASK2, MSG, MSG
+ movdqu (%rsi), MSG_LOW
+
+.Lld_partial_16:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ *
+ * Write the leading bytes of T0 to %rdx using at most one 16-, 8-, 4-, 2-
+ * and 1-byte store each (so never more than 31 bytes), stopping once %rcx
+ * bytes have been written.
+ * input:
+ * %rdx - dst
+ * %rcx - bytes
+ * T0 - message block (consumed: permuted while being stored)
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ /* %r8 = bytes still to write, %r9 = write cursor */
+ mov %rcx, %r8
+ mov %rdx, %r9
+
+ cmp $16, %r8
+ jl .Lst_partial_16
+
+ /* store the low 128 bits, then bring the high 128 bits down */
+ movdqu T0_LOW, (%r9)
+ vpermq $MASK2, T0, T0
+
+ sub $16, %r8
+ add $16, %r9
+
+.Lst_partial_16:
+ /* %r10 = pending data for the sub-16-byte stores */
+ movq T0_LOW, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ pextrq $1, T0_LOW, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $16, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_avx2_init(void *state, const void *key,
+ * const void *iv);
+ *
+ * Build the initial state from the 32-byte key (%rsi) and the 16 IV bytes
+ * read from %rdx, and store the five 32-byte state words at state (%rdi).
+ */
+ENTRY(crypto_morus1280_avx2_init)
+ FRAME_BEGIN
+
+ /* load IV: */
+ /* only the low 128 bits are loaded; the upper half stays zero */
+ vpxor STATE0, STATE0, STATE0
+ movdqu (%rdx), STATE0_LOW
+ /* load key: */
+ vmovdqu (%rsi), KEY
+ vmovdqa KEY, STATE1
+ /* load all ones: */
+ vpcmpeqd STATE2, STATE2, STATE2
+ /* load all zeros: */
+ vpxor STATE3, STATE3, STATE3
+ /* load the constant: */
+ vmovdqa .Lmorus1280_const, STATE4
+
+ /* update 16 times with zero: */
+ /* (KEY shares its register with MSG; the zero update leaves it intact) */
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+
+ /* xor-in the key again after updates: */
+ vpxor KEY, STATE1, STATE1
+
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_init)
+
+/*
+ * void crypto_morus1280_avx2_ad(void *state, const void *data,
+ * unsigned int length);
+ *
+ * Absorb the whole 32-byte blocks of data (%rsi, length in %rdx) into the
+ * state at %rdi. Returns without touching the state when length is below 32;
+ * a trailing partial block is not processed here.
+ */
+ENTRY(crypto_morus1280_avx2_ad)
+ FRAME_BEGIN
+
+ cmp $32, %rdx
+ jb .Lad_out
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* aligned-load loop only when data is 32-byte aligned */
+ mov %rsi, %r8
+ and $0x1F, %r8
+ jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+ vmovdqa (%rsi), MSG
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ /* continue while at least one whole block remains (signed compare) */
+ cmp $32, %rdx
+ jge .Lad_a_loop
+
+ jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+ vmovdqu (%rsi), MSG
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ cmp $32, %rdx
+ jge .Lad_u_loop
+
+.Lad_cont:
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_ad)
+
+/*
+ * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
+ * unsigned int length);
+ *
+ * Encrypt the whole 32-byte blocks of src (%rsi) into dst (%rdx), length in
+ * %rcx, updating the state at %rdi with each plaintext block. Returns
+ * without touching the state when length is below 32.
+ * Keystream per block: STATE0 ^ permute(STATE1, MASK3) ^ (STATE2 & STATE3).
+ */
+ENTRY(crypto_morus1280_avx2_enc)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Lenc_out
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* aligned loop only when both src and dst are 32-byte aligned */
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0x1F, %r8
+ jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+ vmovdqa (%rsi), MSG
+ vmovdqa MSG, T0
+ vpxor STATE0, T0, T0
+ vpermq $MASK3, STATE1, T1
+ vpxor T1, T0, T0
+ vpand STATE2, STATE3, T1
+ vpxor T1, T0, T0
+ /* store before the update: __morus1280_update clobbers T0 */
+ vmovdqa T0, (%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_a_loop
+
+ jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+ vmovdqu (%rsi), MSG
+ vmovdqa MSG, T0
+ vpxor STATE0, T0, T0
+ vpermq $MASK3, STATE1, T1
+ vpxor T1, T0, T0
+ vpand STATE2, STATE3, T1
+ vpxor T1, T0, T0
+ vmovdqu T0, (%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_u_loop
+
+.Lenc_cont:
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_enc)
+
+/*
+ * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ *
+ * Encrypt a final partial block: the bytes at src (%rsi, count in %rcx) are
+ * gathered into a zero-padded block, encrypted, and exactly that many
+ * ciphertext bytes are written to dst (%rdx). The state at %rdi is then
+ * updated with the zero-padded plaintext.
+ */
+ENTRY(crypto_morus1280_avx2_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	/* encrypt message: */
+	call __load_partial
+
+	vmovdqa MSG, T0
+	vpxor STATE0, T0, T0
+	vpermq $MASK3, STATE1, T1
+	vpxor T1, T0, T0
+	vpand STATE2, STATE3, T1
+	vpxor T1, T0, T0
+
+	/* T0 is consumed here; MSG still holds the padded plaintext */
+	call __store_partial
+
+	call __morus1280_update
+
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+	FRAME_END
+	/*
+	 * Explicit return: without it execution runs on into
+	 * crypto_morus1280_avx2_dec, which directly follows this function.
+	 */
+	ret
+ENDPROC(crypto_morus1280_avx2_enc_tail)
+
+/*
+ * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
+ * unsigned int length);
+ *
+ * Decrypts whole 32-byte blocks from src to dst and updates the state after
+ * each block. Returns without touching the state when length < 32; bytes
+ * left over after the last whole block are not processed here.
+ *
+ * Register use below: %rdi - state, %rsi - src, %rdx - dst,
+ * %rcx - bytes remaining, %r8 - scratch for the alignment test.
+ */
+ENTRY(crypto_morus1280_avx2_dec)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Ldec_out
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* use the aligned loop only if src and dst are both 32-byte aligned: */
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0x1F, %r8
+ jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+ /* MSG = input ^ STATE0 ^ vpermq(MASK3, STATE1) ^ (STATE2 & STATE3) */
+ vmovdqa (%rsi), MSG
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ vmovdqa MSG, (%rdx)
+
+ /* MSG now holds the decrypted block, which is what gets absorbed: */
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_a_loop
+
+ jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+ /* same as the aligned loop, but with unaligned load/store: */
+ vmovdqu (%rsi), MSG
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ vmovdqu MSG, (%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_u_loop
+
+.Ldec_cont:
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_dec)
+
+/*
+ * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ *
+ * Decrypts one final, partial block and absorbs the decrypted bytes into
+ * the state.
+ *
+ * Register use below: %rdi - state, %rsi - src, %rdx - dst, %rcx - length.
+ * __load_partial and __store_partial are defined outside this excerpt.
+ */
+ENTRY(crypto_morus1280_avx2_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* decrypt message: */
+ call __load_partial
+
+ /* MSG = input ^ STATE0 ^ vpermq(MASK3, STATE1) ^ (STATE2 & STATE3) */
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ /* copy to T0 for __store_partial; MSG is kept for the update below */
+ vmovdqa MSG, T0
+
+ call __store_partial
+
+ /*
+ * mask with byte count: broadcast the low byte of %rcx to every byte
+ * lane and keep only the MSG bytes where that count is greater than
+ * the matching byte of .Lmorus1280_counter (defined outside this
+ * excerpt; presumably the byte indices 0..31 - TODO confirm).
+ */
+ movq %rcx, T0_LOW
+ vpbroadcastb T0_LOW, T0
+ vmovdqa .Lmorus1280_counter, T1
+ vpcmpgtb T1, T0, T0
+ vpand T0, MSG, MSG
+
+ call __morus1280_update
+
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_dec_tail)
+
+/*
+ * void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ *
+ * Finalizes the state with the two lengths and XORs the resulting 32-byte
+ * value into the buffer at tag_xor. The state in memory is only read here;
+ * it is not written back.
+ *
+ * Register use below: %rdi - state, %rsi - tag_xor, %rdx - assoclen,
+ * %rcx - cryptlen.
+ */
+ENTRY(crypto_morus1280_avx2_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* xor state[0] into state[4]: */
+ vpxor STATE0, STATE4, STATE4
+
+ /* prepare length block: qword 0 = assoclen, qword 1 = cryptlen, rest 0 */
+ vpxor MSG, MSG, MSG
+ vpinsrq $0, %rdx, MSG_LOW, MSG_LOW
+ vpinsrq $1, %rcx, MSG_LOW, MSG_LOW
+ vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */
+
+ /* update state: 10 updates with the same length block */
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+
+ /* xor tag: */
+ vmovdqu (%rsi), MSG
+
+ /* MSG ^= STATE0 ^ vpermq(MASK3, STATE1) ^ (STATE2 & STATE3) */
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ vmovdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_final)
diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c
new file mode 100644
index 000000000000..f111f36d26dc
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-glue.c
@@ -0,0 +1,68 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ * Glue for AVX2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key,
+ const void *iv);
+asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data,
+ unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+ u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
+
+static const struct x86_cpu_id avx2_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_AVX2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id);
+
+/*
+ * Module entry point: register the AVX2 MORUS-1280 AEAD algorithms, but only
+ * when the running CPU matches avx2_cpu_id; otherwise fail with -ENODEV.
+ */
+static int __init crypto_morus1280_avx2_module_init(void)
+{
+ int err = -ENODEV;
+
+ if (x86_match_cpu(avx2_cpu_id))
+  err = crypto_register_aeads(crypto_morus1280_avx2_algs,
+      ARRAY_SIZE(crypto_morus1280_avx2_algs));
+
+ return err;
+}
+
+/* Module exit point: unregister everything the init function registered. */
+static void __exit crypto_morus1280_avx2_module_exit(void)
+{
+ const int count = ARRAY_SIZE(crypto_morus1280_avx2_algs);
+
+ crypto_unregister_aeads(crypto_morus1280_avx2_algs, count);
+}
+
+module_init(crypto_morus1280_avx2_module_init);
+module_exit(crypto_morus1280_avx2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-avx2");
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
new file mode 100644
index 000000000000..1fe637c7be9d
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-asm.S
@@ -0,0 +1,895 @@
+/*
+ * SSE2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/* Build a pshufd immediate from four 2-bit dword selectors (i0 = lowest). */
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+ (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+/* Selects dwords 2, 3, 0, 1: swaps the two 64-bit halves of a register. */
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+
+/*
+ * Register map. Each 256-bit state word is held as a pair of XMM registers:
+ * _LO is stored at the lower address (bytes 0-15), _HI at bytes 16-31.
+ */
+#define STATE0_LO %xmm0
+#define STATE0_HI %xmm1
+#define STATE1_LO %xmm2
+#define STATE1_HI %xmm3
+#define STATE2_LO %xmm4
+#define STATE2_HI %xmm5
+#define STATE3_LO %xmm6
+#define STATE3_HI %xmm7
+#define STATE4_LO %xmm8
+#define STATE4_HI %xmm9
+/* KEY and MSG share %xmm10/%xmm11; only one of them is live at a time. */
+#define KEY_LO %xmm10
+#define KEY_HI %xmm11
+#define MSG_LO %xmm10
+#define MSG_HI %xmm11
+/* Temporaries. T0_LO is also the scratch register of the macros below. */
+#define T0_LO %xmm12
+#define T0_HI %xmm13
+#define T1_LO %xmm14
+#define T1_HI %xmm15
+
+/*
+ * Initialization constant, loaded into STATE4_LO/STATE4_HI by the init
+ * function.
+ * NOTE(review): section and label names say "morus640" although this file
+ * implements MORUS-1280 - presumably carried over from the MORUS-640 code;
+ * confirm this is intended.
+ */
+.section .rodata.cst16.morus640_const, "aM", @progbits, 16
+.align 16
+.Lmorus640_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+/*
+ * Byte indices 0..31, compared against the byte count in dec_tail to build
+ * a mask that keeps only the first "count" bytes of a block.
+ */
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter_0:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Lmorus640_counter_1:
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+/*
+ * rol1: rotate the 256-bit value \hi:\lo left by one 64-bit word.
+ * Clobbers T0_LO, so neither argument may be T0_LO.
+ */
+.macro rol1 hi, lo
+ /*
+ * HI_1 | HI_0 || LO_1 | LO_0
+ * ==>
+ * HI_0 | HI_1 || LO_1 | LO_0
+ * ==>
+ * HI_0 | LO_1 || LO_0 | HI_1
+ */
+ pshufd $MASK2, \hi, \hi
+ movdqa \hi, T0_LO
+ punpcklqdq \lo, T0_LO
+ punpckhqdq \hi, \lo
+ movdqa \lo, \hi
+ movdqa T0_LO, \lo
+.endm
+
+/*
+ * rol2: rotate the 256-bit value \hi:\lo by two 64-bit words, i.e. swap
+ * \hi and \lo. Clobbers T0_LO.
+ */
+.macro rol2 hi, lo
+ movdqa \lo, T0_LO
+ movdqa \hi, \lo
+ movdqa T0_LO, \hi
+.endm
+
+/*
+ * rol3: rotate the 256-bit value \hi:\lo left by three 64-bit words.
+ * Clobbers T0_LO. \lo must not be T0_LO: the original \lo is read again
+ * (by punpcklqdq) after T0_LO has been overwritten.
+ */
+.macro rol3 hi, lo
+ /*
+ * HI_1 | HI_0 || LO_1 | LO_0
+ * ==>
+ * HI_0 | HI_1 || LO_1 | LO_0
+ * ==>
+ * LO_0 | HI_1 || HI_0 | LO_1
+ */
+ pshufd $MASK2, \hi, \hi
+ movdqa \lo, T0_LO
+ punpckhqdq \hi, T0_LO
+ punpcklqdq \lo, \hi
+ movdqa T0_LO, \lo
+.endm
+
+/*
+ * morus1280_round: one round of the state update.
+ * s0 ^= (s1 & s2) ^ s3
+ * each 64-bit word of s0 is rotated left by \b bits
+ * s3 is rotated by whole 64-bit words using macro \w (rol1/rol2/rol3)
+ * The s4_l/s4_h arguments are accepted but not used. Clobbers T0_LO.
+ */
+.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
+ /* s0 ^= s1 & s2 (low half, then high half) */
+ movdqa \s1_l, T0_LO
+ pand \s2_l, T0_LO
+ pxor T0_LO, \s0_l
+
+ movdqa \s1_h, T0_LO
+ pand \s2_h, T0_LO
+ pxor T0_LO, \s0_h
+
+ /* s0 ^= s3 */
+ pxor \s3_l, \s0_l
+ pxor \s3_h, \s0_h
+
+ /* rotate each qword of s0 left by \b: (x << b) ^ (x >> (64 - b)) */
+ movdqa \s0_l, T0_LO
+ psllq $\b, T0_LO
+ psrlq $(64 - \b), \s0_l
+ pxor T0_LO, \s0_l
+
+ movdqa \s0_h, T0_LO
+ psllq $\b, T0_LO
+ psrlq $(64 - \b), \s0_h
+ pxor T0_LO, \s0_h
+
+ /* word-wise rotation of s3 */
+ \w \s3_h, \s3_l
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ * Runs the five rounds of the state update, XORing the message block into
+ * STATE1..STATE4 after rounds 1..4 respectively.
+ * input:
+ * STATE[0-4] - input state
+ * MSG - message block
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0 (only T0_LO is written, by the round and rol macros)
+ */
+__morus1280_update:
+ morus1280_round \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ 13, rol1
+ pxor MSG_LO, STATE1_LO
+ pxor MSG_HI, STATE1_HI
+ morus1280_round \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ 46, rol2
+ pxor MSG_LO, STATE2_LO
+ pxor MSG_HI, STATE2_HI
+ morus1280_round \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ 38, rol3
+ pxor MSG_LO, STATE3_LO
+ pxor MSG_HI, STATE3_HI
+ morus1280_round \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ 7, rol2
+ pxor MSG_LO, STATE4_LO
+ pxor MSG_HI, STATE4_HI
+ morus1280_round \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ 4, rol1
+ ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ * Same five rounds as __morus1280_update, but with no message XOR between
+ * rounds (equivalent to an all-zero message block). MSG is neither read
+ * nor written, so the register pair may hold the key during init.
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0 (only T0_LO is written, by the round and rol macros)
+ */
+__morus1280_update_zero:
+ morus1280_round \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ 13, rol1
+ morus1280_round \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ 46, rol2
+ morus1280_round \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ 38, rol3
+ morus1280_round \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ 7, rol2
+ morus1280_round \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ 4, rol1
+ ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * Loads the first %rcx bytes at %rsi into MSG, zero-filling the rest.
+ * Only bits 0-4 of %rcx are examined. The bytes are gathered from the end
+ * of the buffer backwards in chunks of 1, 2, 4, 8 and 16 bytes, one chunk
+ * per set bit of the count, so no byte beyond the count is read.
+ * input:
+ * %rsi - src
+ * %rcx - bytes
+ * output:
+ * MSG - message block
+ * changed:
+ * %r8
+ * %r9
+ * T0_LO (when bit 3 of the count is set)
+ */
+__load_partial:
+ xor %r9, %r9
+ pxor MSG_LO, MSG_LO
+ pxor MSG_HI, MSG_HI
+
+ /* bit 0 set: fetch the single byte at offset (count & 0x1E) */
+ mov %rcx, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov %rcx, %r8
+ and $0x1E, %r8
+ add %rsi, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ /* bit 1 set: fetch the 2 bytes at offset (count & 0x1C) */
+ mov %rcx, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov %rcx, %r8
+ and $0x1C, %r8
+ add %rsi, %r8
+ shl $16, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ /* bit 2 set: fetch the 4 bytes at offset (count & 0x18) */
+ mov %rcx, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov %rcx, %r8
+ and $0x18, %r8
+ add %rsi, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ /* up to 7 collected bytes now sit in the low qword of MSG_LO */
+ movq %r9, MSG_LO
+
+ /* bit 3 set: fetch the 8 bytes at offset (count & 0x10) */
+ mov %rcx, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov %rcx, %r8
+ and $0x10, %r8
+ add %rsi, %r8
+ pslldq $8, MSG_LO
+ movq (%r8), T0_LO
+ pxor T0_LO, MSG_LO
+
+.Lld_partial_8:
+ /* bit 4 set: what was gathered belongs in MSG_HI; MSG_LO is bytes 0-15 */
+ mov %rcx, %r8
+ and $0x10, %r8
+ jz .Lld_partial_16
+
+ movdqa MSG_LO, MSG_HI
+ movdqu (%rsi), MSG_LO
+
+.Lld_partial_16:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * Writes the first %rcx bytes of the block held in T0 to %rdx, in chunks
+ * of 16, 8, 4, 2 and 1 bytes. %rcx and %rdx themselves are left unchanged.
+ * input:
+ * %rdx - dst
+ * %rcx - bytes
+ * T0 - message block to store
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ * T0_LO (overwritten/shifted while storing)
+ */
+__store_partial:
+ /* r8 = bytes left to store, r9 = write cursor */
+ mov %rcx, %r8
+ mov %rdx, %r9
+
+ cmp $16, %r8
+ jl .Lst_partial_16
+
+ /* store the low 16 bytes, then continue with the high half */
+ movdqu T0_LO, (%r9)
+ movdqa T0_HI, T0_LO
+
+ sub $16, %r8
+ add $16, %r9
+
+.Lst_partial_16:
+ /* r10 = next 8 pending bytes */
+ movq T0_LO, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0_LO
+ movq T0_LO, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $16, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_sse2_init(void *state, const void *key,
+ * const void *iv);
+ *
+ * Builds the initial state from a 32-byte key and a 16-byte IV and writes
+ * the 160-byte state to *state.
+ *
+ * Register use below: %rdi - state, %rsi - key, %rdx - iv.
+ */
+ENTRY(crypto_morus1280_sse2_init)
+ FRAME_BEGIN
+
+ /* load IV: 16 bytes into the low half of STATE0, high half zero */
+ pxor STATE0_HI, STATE0_HI
+ movdqu (%rdx), STATE0_LO
+ /* load key: */
+ movdqu 0(%rsi), KEY_LO
+ movdqu 16(%rsi), KEY_HI
+ movdqa KEY_LO, STATE1_LO
+ movdqa KEY_HI, STATE1_HI
+ /* load all ones: */
+ pcmpeqd STATE2_LO, STATE2_LO
+ pcmpeqd STATE2_HI, STATE2_HI
+ /* load all zeros: */
+ pxor STATE3_LO, STATE3_LO
+ pxor STATE3_HI, STATE3_HI
+ /* load the constant: */
+ movdqa .Lmorus640_const_0, STATE4_LO
+ movdqa .Lmorus640_const_1, STATE4_HI
+
+ /*
+ * update 16 times with zero:
+ * (the _zero variant does not touch MSG, so KEY - which shares the
+ * MSG registers - survives for the xor below)
+ */
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+
+ /* xor-in the key again after updates: */
+ pxor KEY_LO, STATE1_LO
+ pxor KEY_HI, STATE1_HI
+
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_init)
+
+/*
+ * void crypto_morus1280_sse2_ad(void *state, const void *data,
+ * unsigned int length);
+ *
+ * Absorbs whole 32-byte blocks of associated data into the state. Returns
+ * without touching the state when length < 32; bytes left over after the
+ * last whole block are not processed here.
+ *
+ * Register use below: %rdi - state, %rsi - data, %rdx - bytes remaining,
+ * %r8 - scratch for the alignment test.
+ */
+ENTRY(crypto_morus1280_sse2_ad)
+ FRAME_BEGIN
+
+ cmp $32, %rdx
+ jb .Lad_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* use the aligned loop only if data is 16-byte aligned: */
+ mov %rsi, %r8
+ and $0xF, %r8
+ jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+ movdqa 0(%rsi), MSG_LO
+ movdqa 16(%rsi), MSG_HI
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ cmp $32, %rdx
+ jge .Lad_a_loop
+
+ jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ cmp $32, %rdx
+ jge .Lad_u_loop
+
+.Lad_cont:
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_ad)
+
+/*
+ * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
+ * unsigned int length);
+ *
+ * Encrypts whole 32-byte blocks from src to dst and updates the state after
+ * each block. Returns without touching the state when length < 32; bytes
+ * left over after the last whole block are not processed here.
+ *
+ * Register use below: %rdi - state, %rsi - src, %rdx - dst,
+ * %rcx - bytes remaining, %r8 - scratch for the alignment test.
+ */
+ENTRY(crypto_morus1280_sse2_enc)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Lenc_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* use the aligned loop only if src and dst are both 16-byte aligned: */
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+ /* T0 = MSG ^ rol3(STATE1) ^ STATE0 ^ (STATE2 & STATE3) */
+ movdqa 0(%rsi), MSG_LO
+ movdqa 16(%rsi), MSG_HI
+ /* rotate a copy of STATE1 in T1; rol3 clobbers T0_LO, so T0 is set after */
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ pxor STATE0_LO, T0_LO
+ pxor STATE0_HI, T0_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ movdqa T0_LO, 0(%rdx)
+ movdqa T0_HI, 16(%rdx)
+
+ /* MSG still holds the input block and is what gets absorbed: */
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_a_loop
+
+ jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+ /* same as the aligned loop, but with unaligned load/store: */
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ pxor STATE0_LO, T0_LO
+ pxor STATE0_HI, T0_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ movdqu T0_LO, 0(%rdx)
+ movdqu T0_HI, 16(%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_u_loop
+
+.Lenc_cont:
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_enc)
+
+/*
+ * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ *
+ * Encrypts one final, partial block (length bytes, loaded zero-padded by
+ * __load_partial), writes length bytes of ciphertext to dst and absorbs the
+ * zero-padded plaintext block into the state.
+ *
+ * Register use below: %rdi - state, %rsi - src, %rdx - dst, %rcx - length.
+ */
+ENTRY(crypto_morus1280_sse2_enc_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* encrypt message: */
+ call __load_partial
+
+ /* T0 = MSG ^ rol3(STATE1) ^ STATE0 ^ (STATE2 & STATE3) */
+ /* rotate a copy of STATE1 in T1; rol3 clobbers T0_LO, so T0 is set after */
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ pxor STATE0_LO, T0_LO
+ pxor STATE0_HI, T0_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+
+ call __store_partial
+
+ /* absorb the zero-padded input block still held in MSG: */
+ call __morus1280_update
+
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+ FRAME_END
+ /*
+ * Explicit return: without it execution would run off the end of this
+ * function into whatever code the assembler places next.
+ */
+ ret
+ENDPROC(crypto_morus1280_sse2_enc_tail)
+
+/*
+ * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
+ * unsigned int length);
+ *
+ * Decrypts whole 32-byte blocks from src to dst and updates the state after
+ * each block. Returns without touching the state when length < 32; bytes
+ * left over after the last whole block are not processed here.
+ *
+ * Register use below: %rdi - state, %rsi - src, %rdx - dst,
+ * %rcx - bytes remaining, %r8 - scratch for the alignment test.
+ */
+ENTRY(crypto_morus1280_sse2_dec)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Ldec_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* use the aligned loop only if src and dst are both 16-byte aligned: */
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+ /* MSG = input ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3) */
+ movdqa 0(%rsi), MSG_LO
+ movdqa 16(%rsi), MSG_HI
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa MSG_LO, 0(%rdx)
+ movdqa MSG_HI, 16(%rdx)
+
+ /* MSG now holds the decrypted block, which is what gets absorbed: */
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_a_loop
+
+ jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+ /* same as the aligned loop, but with unaligned load/store: */
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqu MSG_LO, 0(%rdx)
+ movdqu MSG_HI, 16(%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_u_loop
+
+.Ldec_cont:
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_dec)
+
+/*
+ * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ *
+ * Decrypts one final, partial block, writes length bytes of plaintext to
+ * dst, and absorbs the plaintext (with bytes at index >= length zeroed)
+ * into the state.
+ *
+ * Register use below: %rdi - state, %rsi - src, %rdx - dst, %rcx - length.
+ */
+ENTRY(crypto_morus1280_sse2_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* decrypt message: */
+ call __load_partial
+
+ /* MSG = input ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3) */
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ /* copy to T0 for __store_partial; MSG is kept for the update below */
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+
+ call __store_partial
+
+ /*
+ * mask with byte count: four punpcklbw steps replicate the low byte
+ * of %rcx into all 16 bytes; comparing against the byte indices 0..31
+ * gives 0xff where index < count, so the pand keeps only the first
+ * "count" bytes of MSG.
+ */
+ movq %rcx, T0_LO
+ punpcklbw T0_LO, T0_LO
+ punpcklbw T0_LO, T0_LO
+ punpcklbw T0_LO, T0_LO
+ punpcklbw T0_LO, T0_LO
+ movdqa T0_LO, T0_HI
+ movdqa .Lmorus640_counter_0, T1_LO
+ movdqa .Lmorus640_counter_1, T1_HI
+ pcmpgtb T1_LO, T0_LO
+ pcmpgtb T1_HI, T0_HI
+ pand T0_LO, MSG_LO
+ pand T0_HI, MSG_HI
+
+ call __morus1280_update
+
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_dec_tail)
+
+/*
+ * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ *
+ * Finalizes the state with the two lengths and XORs the resulting 32-byte
+ * value into the buffer at tag_xor. The state in memory is only read here;
+ * it is not written back.
+ *
+ * Register use below: %rdi - state, %rsi - tag_xor, %rdx - assoclen,
+ * %rcx - cryptlen. Clobbers MSG, T0 and T1.
+ */
+ENTRY(crypto_morus1280_sse2_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* xor state[0] into state[4]: */
+ pxor STATE0_LO, STATE4_LO
+ pxor STATE0_HI, STATE4_HI
+
+ /* prepare length block: qword 0 = assoclen, qword 1 = cryptlen, rest 0 */
+ movq %rdx, MSG_LO
+ movq %rcx, T0_LO
+ pslldq $8, T0_LO
+ pxor T0_LO, MSG_LO
+ psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
+ pxor MSG_HI, MSG_HI
+
+ /* update state: 10 updates with the same length block */
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+
+ /* xor tag: */
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+
+ /* MSG ^= STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3) */
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ /*
+ * Rotate the copy of STATE1 in T1, not T0: rol3 uses T0_LO as its
+ * scratch register, so passing T0_LO as the low half makes the macro
+ * read an already-overwritten value and yields a wrong high half.
+ */
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa STATE2_LO, T0_LO
+ movdqa STATE2_HI, T0_HI
+ pand STATE3_LO, T0_LO
+ pand STATE3_HI, T0_HI
+ pxor T0_LO, MSG_LO
+ pxor T0_HI, MSG_HI
+
+ movdqu MSG_LO, 0(%rsi)
+ movdqu MSG_HI, 16(%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_final)
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c
new file mode 100644
index 000000000000..839270aa713c
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-glue.c
@@ -0,0 +1,68 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ * Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key,
+ const void *iv);
+asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data,
+ unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+ u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
+
+static const struct x86_cpu_id sse2_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
+
+/*
+ * Module entry point: register the SSE2 MORUS-1280 AEAD algorithms, but only
+ * when the running CPU matches sse2_cpu_id; otherwise fail with -ENODEV.
+ */
+static int __init crypto_morus1280_sse2_module_init(void)
+{
+ int err = -ENODEV;
+
+ if (x86_match_cpu(sse2_cpu_id))
+  err = crypto_register_aeads(crypto_morus1280_sse2_algs,
+      ARRAY_SIZE(crypto_morus1280_sse2_algs));
+
+ return err;
+}
+
+/* Module exit point: unregister everything the init function registered. */
+static void __exit crypto_morus1280_sse2_module_exit(void)
+{
+ const int count = ARRAY_SIZE(crypto_morus1280_sse2_algs);
+
+ crypto_unregister_aeads(crypto_morus1280_sse2_algs, count);
+}
+
+module_init(crypto_morus1280_sse2_module_init);
+module_exit(crypto_morus1280_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-sse2");
diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c
new file mode 100644
index 000000000000..0dccdda1eb3a
--- /dev/null
+++ b/arch/x86/crypto/morus1280_glue.c
@@ -0,0 +1,302 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ * Common x86 SIMD glue skeleton
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/morus1280_glue.h>
+#include <crypto/scatterwalk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Working cipher state, MORUS_STATE_BLOCKS blocks in size.  A pointer to it
+ * is handed to every ops callback (init, ad, enc/dec, final) in this file.
+ */
+struct morus1280_state {
+ struct morus1280_block s[MORUS_STATE_BLOCKS];
+};
+
+/*
+ * Per-direction dispatch table built by the encrypt/decrypt entry points:
+ * the walk initializer to use, the routine for the bulk data and the routine
+ * for a trailing partial block.
+ */
+struct morus1280_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, const void *src, void *dst,
+ unsigned int length);
+ void (*crypt_tail)(void *state, const void *src, void *dst,
+ unsigned int length);
+};
+
+/*
+ * Feed @assoclen bytes of associated data from @sg_src to ops->ad().
+ *
+ * The scatterlist is walked one mapped segment at a time.  Bytes that do not
+ * complete a MORUS1280_BLOCK_SIZE block are collected in a local block buffer
+ * and carried over to the next segment; whatever is still buffered once all
+ * data has been seen is zero-padded to a full block and passed to ops->ad().
+ */
+static void crypto_morus1280_glue_process_ad(
+ struct morus1280_state *state,
+ const struct morus1280_glue_ops *ops,
+ struct scatterlist *sg_src, unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct morus1280_block buf;
+ /* number of bytes currently held in buf */
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ /* buffered bytes plus this segment reach at least one full block */
+ if (pos + size >= MORUS1280_BLOCK_SIZE) {
+ if (pos > 0) {
+ /* top up the carried-over partial block and flush it first */
+ unsigned int fill = MORUS1280_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ /*
+ * The whole remaining length is handed over, but afterwards the
+ * pointer only advances by the block-aligned part and the
+ * sub-block remainder is buffered below.  Presumably ops->ad()
+ * ignores a trailing partial block - confirm against the
+ * assembly routine.
+ */
+ ops->ad(state, src, left);
+ src += left & ~(MORUS1280_BLOCK_SIZE - 1);
+ left &= MORUS1280_BLOCK_SIZE - 1;
+ }
+
+ /* keep the bytes that did not fill a block for the next round */
+ memcpy(buf.bytes + pos, src, left);
+
+ pos += left;
+ assoclen -= size;
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ /* zero-pad and process the final partial block, if any */
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos);
+ ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
+ }
+}
+
+/*
+ * Run the bulk and tail routines from @ops over the payload of @req.
+ *
+ * Whole blocks are processed with ops.crypt_blocks(); bytes of a walk step
+ * that do not fill a block are returned to the walk as residue so that they
+ * are presented again together with the following data.  Only the last,
+ * genuinely partial block goes through ops.crypt_tail().  Calling the tail
+ * routine on every step whose length is not a block multiple would pad in
+ * the middle of the message whenever the input is split into chunks.
+ *
+ * NOTE(review): this relies on the walk handing out at least one full block
+ * per step until the final partial one, i.e. on the algorithm's chunksize
+ * being MORUS1280_BLOCK_SIZE - confirm in MORUS1280_DECLARE_ALGS.
+ */
+static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state,
+						struct morus1280_ops ops,
+						struct aead_request *req)
+{
+	struct skcipher_walk walk;
+	unsigned int full;
+
+	/*
+	 * The caller in this file invokes us between kernel_fpu_begin() and
+	 * kernel_fpu_end(), so the walk has to be atomic: it must not sleep.
+	 */
+	ops.skcipher_walk_init(&walk, req, true);
+
+	while (walk.nbytes >= MORUS1280_BLOCK_SIZE) {
+		full = walk.nbytes & ~(MORUS1280_BLOCK_SIZE - 1);
+
+		ops.crypt_blocks(state, walk.src.virt.addr,
+				 walk.dst.virt.addr, full);
+
+		/* hand the sub-block remainder back to the walk */
+		skcipher_walk_done(&walk, walk.nbytes - full);
+	}
+
+	if (walk.nbytes) {
+		ops.crypt_tail(state, walk.src.virt.addr, walk.dst.virt.addr,
+			       walk.nbytes);
+		skcipher_walk_done(&walk, 0);
+	}
+}
+
+/*
+ * Store the key in the transform context.
+ *
+ * A key of MORUS1280_BLOCK_SIZE bytes is copied as is; a key of half that
+ * size is stored twice, back to back.  Any other length sets
+ * CRYPTO_TFM_RES_BAD_KEY_LEN on the transform and returns -EINVAL.
+ */
+int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				 unsigned int keylen)
+{
+	struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
+
+	switch (keylen) {
+	case MORUS1280_BLOCK_SIZE:
+		memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE);
+		break;
+	case MORUS1280_BLOCK_SIZE / 2:
+		memcpy(ctx->key.bytes, key, keylen);
+		memcpy(ctx->key.bytes + keylen, key, keylen);
+		break;
+	default:
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey);
+
+/* Accept any tag size up to MORUS_MAX_AUTH_SIZE; reject larger with -EINVAL. */
+int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm,
+				      unsigned int authsize)
+{
+	if (authsize > MORUS_MAX_AUTH_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize);
+
+/*
+ * Common body of encryption and decryption.  Inside one FPU section the
+ * state is initialised from key and IV, the associated data and then the
+ * payload are processed, and the final callback is run on @tag_xor.
+ */
+static void crypto_morus1280_glue_crypt(struct aead_request *req,
+					struct morus1280_ops ops,
+					unsigned int cryptlen,
+					struct morus1280_block *tag_xor)
+{
+	struct morus1280_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	const struct morus1280_glue_ops *glue = ctx->ops;
+	struct morus1280_state st;
+
+	kernel_fpu_begin();
+
+	glue->init(&st, &ctx->key, req->iv);
+	crypto_morus1280_glue_process_ad(&st, glue, req->src, req->assoclen);
+	crypto_morus1280_glue_process_crypt(&st, ops, req);
+	glue->final(&st, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+/*
+ * AEAD encrypt: process the request with the encryption routines, then copy
+ * authsize bytes of the resulting tag block into req->dst at offset
+ * assoclen + cryptlen.  Always returns 0.
+ */
+int crypto_morus1280_glue_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus1280_ops enc_ops = {
+		.crypt_tail = ctx->ops->enc_tail,
+		.crypt_blocks = ctx->ops->enc,
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+	};
+	/* handed to the final callback as an all-zero block */
+	struct morus1280_block tag = {};
+	unsigned int taglen = crypto_aead_authsize(tfm);
+	unsigned int datalen = req->cryptlen;
+
+	crypto_morus1280_glue_crypt(req, enc_ops, datalen, &tag);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + datalen, taglen, 1);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt);
+
+/*
+ * AEAD decrypt: read the received tag from behind the ciphertext in req->src,
+ * process the request with the decryption routines and let the final callback
+ * operate on that tag block.  The message is authentic when the first
+ * authsize bytes of the block end up all zero; otherwise -EBADMSG is
+ * returned.
+ */
+int crypto_morus1280_glue_decrypt(struct aead_request *req)
+{
+	static const u8 zeros[MORUS1280_BLOCK_SIZE] = {};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus1280_ops dec_ops = {
+		.crypt_tail = ctx->ops->dec_tail,
+		.crypt_blocks = ctx->ops->dec,
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+	};
+	struct morus1280_block tag;
+	unsigned int taglen = crypto_aead_authsize(tfm);
+	unsigned int datalen = req->cryptlen - taglen;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + datalen, taglen, 0);
+
+	crypto_morus1280_glue_crypt(req, dec_ops, datalen, &tag);
+
+	/* crypto_memneq() is used for the comparison against zero */
+	if (crypto_memneq(tag.bytes, zeros, taglen))
+		return -EBADMSG;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt);
+
+/*
+ * Record the implementation callbacks @ops in the transform context of
+ * @aead.  The other functions in this file reach them through ctx->ops.
+ */
+void crypto_morus1280_glue_init_ops(struct crypto_aead *aead,
+ const struct morus1280_glue_ops *ops)
+{
+ struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
+ ctx->ops = ops;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops);
+
+/*
+ * setkey for the cryptd-backed wrapper: the context holds a pointer to the
+ * cryptd AEAD, and the key is passed on to its base transform.
+ */
+int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				 unsigned int keylen)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey);
+
+/*
+ * setauthsize for the cryptd-backed wrapper: passes the tag size on to the
+ * base transform of the cryptd AEAD stored in the context.
+ */
+int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead,
+				      unsigned int authsize)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize);
+
+/*
+ * Encrypt through the wrapper.  The request is retargeted at the cryptd child
+ * transform when the FPU is usable and either we are not in atomic context or
+ * nothing is queued on the cryptd transform; otherwise it goes to the cryptd
+ * base transform.
+ */
+int cryptd_morus1280_glue_encrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_encrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt);
+
+/*
+ * Decrypt through the wrapper.  The request is retargeted at the cryptd child
+ * transform when the FPU is usable and either we are not in atomic context or
+ * nothing is queued on the cryptd transform; otherwise it goes to the cryptd
+ * base transform.
+ */
+int cryptd_morus1280_glue_decrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_decrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt);
+
+/*
+ * Transform constructor for the wrapper: allocate a cryptd AEAD for the
+ * internal algorithm named "__" + this algorithm's driver name, store it in
+ * the context and adopt the request size of its base transform.
+ *
+ * Returns 0, -ENAMETOOLONG when the internal name does not fit, or the error
+ * from cryptd_alloc_aead().
+ */
+int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+	char internal_name[CRYPTO_MAX_ALG_NAME];
+	struct cryptd_aead *cryptd_tfm;
+	int len;
+
+	len = snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s",
+		       crypto_aead_alg(aead)->base.cra_driver_name);
+	if (len >= CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
+	cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*slot = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm);
+
+/* Transform destructor for the wrapper: free the cryptd AEAD kept in the context. */
+void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
new file mode 100644
index 000000000000..71c72a0a0862
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-asm.S
@@ -0,0 +1,614 @@
+/*
+ * SSE2 implementation of MORUS-640
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/* Pack four 2-bit dword selectors into one pshufd immediate (i0 = lowest). */
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+ (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+/* The three dword permutations used with pshufd in this file. */
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+/* Register roles.  Note that KEY and MSG are both aliases of %xmm5. */
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define KEY %xmm5
+#define MSG %xmm5
+#define T0 %xmm6
+#define T1 %xmm7
+
+/*
+ * Initialisation constants, loaded into STATE3 and STATE4 by the init
+ * routine.  The 32 bytes are the first Fibonacci numbers reduced modulo 256.
+ */
+.section .rodata.cst16.morus640_const, "aM", @progbits, 32
+.align 16
+.Lmorus640_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+/*
+ * Byte indices 0..15; compared against the byte count in the decryption
+ * tail routine to build a mask of the valid bytes.
+ */
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * One round step:
+ *   s0 ^= (s1 & s2) ^ s3
+ *   every 32-bit lane of s0 is rotated left by b bits
+ *   the 32-bit lanes of s3 are permuted according to shuffle immediate w
+ * The s4 argument is not referenced in the body.  Clobbers T0.
+ */
+.macro morus640_round s0, s1, s2, s3, s4, b, w
+ movdqa \s1, T0
+ pand \s2, T0
+ pxor T0, \s0
+ pxor \s3, \s0
+ /* rotate: (s0 << b) combined with (s0 >> (32 - b)) */
+ movdqa \s0, T0
+ pslld $\b, T0
+ psrld $(32 - \b), \s0
+ pxor T0, \s0
+ pshufd $\w, \s3, \s3
+.endm
+
+/*
+ * __morus640_update: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * MSG - message block
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus640_update:
+ /* five rounds; MSG is xored into STATE1..STATE4 between them */
+ morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
+ pxor MSG, STATE1
+ morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+ pxor MSG, STATE2
+ morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
+ pxor MSG, STATE3
+ morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+ pxor MSG, STATE4
+ morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+ ret
+ENDPROC(__morus640_update)
+
+
+/*
+ * __morus640_update_zero: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus640_update_zero:
+ /* same five rounds as __morus640_update, without the message xors */
+ morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
+ morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+ morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
+ morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+ morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+ ret
+ENDPROC(__morus640_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * %rsi - src
+ * %rcx - bytes
+ * output:
+ * MSG - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ /*
+ * Bits 0..3 of the byte count select a 1-, 2-, 4- and 8-byte piece.  The
+ * pieces are gathered from the end of the data towards its start: each
+ * step shifts what has been collected so far upwards and places the
+ * earlier bytes below it.  Bytes that are not loaded stay zero.
+ */
+ xor %r9, %r9
+ pxor MSG, MSG
+
+ mov %rcx, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ /* odd count: fetch the single byte at offset (bytes & 0x1E) */
+ mov %rcx, %r8
+ and $0x1E, %r8
+ add %rsi, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov %rcx, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ /* 2-byte piece at offset (bytes & 0x1C) goes below what r9 holds */
+ mov %rcx, %r8
+ and $0x1C, %r8
+ add %rsi, %r8
+ shl $16, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov %rcx, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ /* 4-byte piece at offset (bytes & 0x18); the 32-bit load clears the top of r8 */
+ mov %rcx, %r8
+ and $0x18, %r8
+ add %rsi, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ /* up to 7 bytes collected: move them into the low half of MSG */
+ movq %r9, MSG
+
+ mov %rcx, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ /* 8-byte piece at offset (bytes & 0x10) becomes the low half of MSG */
+ mov %rcx, %r8
+ and $0x10, %r8
+ add %rsi, %r8
+ pslldq $8, MSG
+ movq (%r8), T0
+ pxor T0, MSG
+
+.Lld_partial_8:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * %rdx - dst
+ * %rcx - bytes
+ * T0 - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ /*
+ * Write the low %rcx bytes of T0 to (%rdx) as an 8-, 4-, 2- and 1-byte
+ * piece, in that order.  r8 counts the bytes still to write, r9 is the
+ * write pointer and r10 holds the next bytes of the block.
+ */
+ mov %rcx, %r8
+ mov %rdx, %r9
+
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ /* store the low 8 bytes, then bring the high half of T0 into r10 */
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $16, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_morus640_sse2_init)
+ /* %rdi = state (written), %rsi = key, %rdx = iv */
+ FRAME_BEGIN
+
+ /* load IV: */
+ movdqu (%rdx), STATE0
+ /* load key: */
+ movdqu (%rsi), KEY
+ movdqa KEY, STATE1
+ /* load all ones: */
+ pcmpeqd STATE2, STATE2
+ /* load the constants: */
+ movdqa .Lmorus640_const_0, STATE3
+ movdqa .Lmorus640_const_1, STATE4
+
+ /* update 16 times with zero: */
+ /* (__morus640_update_zero only changes T0, so KEY stays intact) */
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+
+ /* xor-in the key again after updates: */
+ pxor KEY, STATE1
+
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_init)
+
+/*
+ * void crypto_morus640_sse2_ad(void *state, const void *data,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_ad)
+ /*
+ * %rdi = state, %rsi = data, %rdx = length in bytes.
+ * Only whole 16-byte blocks are absorbed; a remainder shorter than 16
+ * bytes is left untouched.
+ * NOTE(review): the C prototype declares length as unsigned int while the
+ * full 64-bit %rdx is compared here - this presumes the caller
+ * zero-extends the argument; confirm.
+ */
+ FRAME_BEGIN
+
+ /* nothing to do (state is not even loaded) for less than one block */
+ cmp $16, %rdx
+ jb .Lad_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* take the movdqa loop only when the data pointer is 16-byte aligned */
+ mov %rsi, %r8
+ and $0xF, %r8
+ jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+ movdqa (%rsi), MSG
+ call __morus640_update
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lad_a_loop
+
+ jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+ movdqu (%rsi), MSG
+ call __morus640_update
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lad_u_loop
+
+.Lad_cont:
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_ad)
+
+/*
+ * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc)
+ /*
+ * %rdi = state, %rsi = src, %rdx = dst, %rcx = length in bytes.
+ * Encrypts whole 16-byte blocks only; a remainder shorter than 16 bytes
+ * is not processed here.  For every block:
+ *   dst = src ^ STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
+ * and the state is then updated with the plaintext block.
+ */
+ FRAME_BEGIN
+
+ cmp $16, %rcx
+ jb .Lenc_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* the aligned loop needs both src and dst to be 16-byte aligned */
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+ movdqa (%rsi), MSG
+ movdqa MSG, T0
+ pxor STATE0, T0
+ pshufd $MASK3, STATE1, T1
+ pxor T1, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+ movdqa T0, (%rdx)
+
+ /* MSG still holds the plaintext block here */
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Lenc_a_loop
+
+ jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+ movdqu (%rsi), MSG
+ movdqa MSG, T0
+ pxor STATE0, T0
+ pshufd $MASK3, STATE1, T1
+ pxor T1, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+ movdqu T0, (%rdx)
+
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Lenc_u_loop
+
+.Lenc_cont:
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_enc)
+
+/*
+ * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc_tail)
+ /*
+ * %rdi = state, %rsi = src, %rdx = dst, %rcx = number of bytes.
+ * Encrypts one trailing partial block: the input is loaded zero-padded
+ * by __load_partial, only %rcx bytes of ciphertext are written back, and
+ * the state is updated with the zero-padded plaintext.
+ */
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* encrypt message: */
+ call __load_partial
+
+ movdqa MSG, T0
+ pxor STATE0, T0
+ pshufd $MASK3, STATE1, T1
+ pxor T1, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+
+ call __store_partial
+
+ /* MSG (the padded plaintext) is not touched by __store_partial */
+ call __morus640_update
+
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+ FRAME_END
+ /*
+ * Return explicitly; without this the code would run straight on into
+ * crypto_morus640_sse2_dec, which follows in the text section.
+ */
+ ret
+ENDPROC(crypto_morus640_sse2_enc_tail)
+
+/*
+ * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec)
+ /*
+ * %rdi = state, %rsi = src, %rdx = dst, %rcx = length in bytes.
+ * Decrypts whole 16-byte blocks only; a remainder shorter than 16 bytes
+ * is not processed here.  For every block:
+ *   dst = src ^ STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
+ * and the state is then updated with the recovered block in MSG.
+ */
+ FRAME_BEGIN
+
+ cmp $16, %rcx
+ jb .Ldec_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* the aligned loop needs both src and dst to be 16-byte aligned */
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+ movdqa (%rsi), MSG
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+ movdqa MSG, (%rdx)
+
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Ldec_a_loop
+
+ jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+ movdqu (%rsi), MSG
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+ movdqu MSG, (%rdx)
+
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Ldec_u_loop
+
+.Ldec_cont:
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_dec)
+
+/*
+ * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec_tail)
+ /*
+ * %rdi = state, %rsi = src, %rdx = dst, %rcx = number of bytes.
+ * Decrypts one trailing partial block and updates the state with the
+ * recovered bytes, zero-padded to a full block.
+ */
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* decrypt message: */
+ call __load_partial
+
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+ movdqa MSG, T0
+
+ call __store_partial
+
+ /* mask with byte count: */
+ /*
+ * The zero padding of the loaded block was xored as well, so the bytes
+ * of MSG past the byte count are not zero any more.  The four
+ * punpcklbw steps replicate the low byte of the count into all 16 bytes
+ * of T0; comparing that against the indices 0..15 gives 0xff exactly for
+ * the positions below the count.
+ */
+ movq %rcx, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa .Lmorus640_counter, T1
+ pcmpgtb T1, T0
+ pand T0, MSG
+
+ call __morus640_update
+
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_dec_tail)
+
+/*
+ * void crypto_morus640_sse2_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus640_sse2_final)
+ /*
+ * %rdi = state (read only here), %rsi = tag_xor, %rdx = assoclen,
+ * %rcx = cryptlen.  The 16 bytes at tag_xor are xored in place with the
+ * value derived from the finalised state; the state is not written back.
+ */
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* xor state[0] into state[4]: */
+ pxor STATE0, STATE4
+
+ /* prepare length block: */
+ /* low qword = assoclen, high qword = cryptlen */
+ movq %rdx, MSG
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ /* update state: */
+ /* ten updates, all with the same length block in MSG */
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+
+ /* xor tag: */
+ movdqu (%rsi), MSG
+
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+
+ movdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_final)
diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c
new file mode 100644
index 000000000000..26b47e2db8d2
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-glue.c
@@ -0,0 +1,68 @@
+/*
+ * The MORUS-640 Authenticated-Encryption Algorithm
+ * Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus640_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus640_sse2_init(void *state, const void *key,
+ const void *iv);
+asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data,
+ unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor,
+ u64 assoclen, u64 cryptlen);
+
+MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
+
+/*
+ * CPU match table: a single entry matching X86_FEATURE_XMM2, followed by the
+ * empty terminator.  It is checked in the module init function and exported
+ * through MODULE_DEVICE_TABLE().
+ */
+static const struct x86_cpu_id sse2_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
+
+/*
+ * Module entry point: register the MORUS-640 SSE2 AEADs when the running CPU
+ * matches sse2_cpu_id, otherwise fail with -ENODEV.
+ */
+static int __init crypto_morus640_sse2_module_init(void)
+{
+	if (x86_match_cpu(sse2_cpu_id))
+		return crypto_register_aeads(crypto_morus640_sse2_algs,
+				ARRAY_SIZE(crypto_morus640_sse2_algs));
+
+	return -ENODEV;
+}
+
+/* Module exit point: unregister every AEAD registered by the init function. */
+static void __exit crypto_morus640_sse2_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_morus640_sse2_algs,
+ ARRAY_SIZE(crypto_morus640_sse2_algs));
+}
+
+module_init(crypto_morus640_sse2_module_init);
+module_exit(crypto_morus640_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus640");
+MODULE_ALIAS_CRYPTO("morus640-sse2");
diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c
new file mode 100644
index 000000000000..7b58fe4d9bd1
--- /dev/null
+++ b/arch/x86/crypto/morus640_glue.c
@@ -0,0 +1,298 @@
+/*
+ * The MORUS-640 Authenticated-Encryption Algorithm
+ * Common x86 SIMD glue skeleton
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/morus640_glue.h>
+#include <crypto/scatterwalk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Working cipher state, MORUS_STATE_BLOCKS blocks in size.  A pointer to it
+ * is handed to every ops callback (init, ad, enc/dec, final) in this file.
+ */
+struct morus640_state {
+ struct morus640_block s[MORUS_STATE_BLOCKS];
+};
+
+/*
+ * Per-direction dispatch table built by the encrypt/decrypt entry points:
+ * the walk initializer to use, the routine for the bulk data and the routine
+ * for a trailing partial block.
+ */
+struct morus640_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, const void *src, void *dst,
+ unsigned int length);
+ void (*crypt_tail)(void *state, const void *src, void *dst,
+ unsigned int length);
+};
+
+/*
+ * Feed @assoclen bytes of associated data from @sg_src to ops->ad().
+ *
+ * The scatterlist is walked one mapped segment at a time.  Bytes that do not
+ * complete a MORUS640_BLOCK_SIZE block are collected in a local block buffer
+ * and carried over to the next segment; whatever is still buffered once all
+ * data has been seen is zero-padded to a full block and passed to ops->ad().
+ */
+static void crypto_morus640_glue_process_ad(
+ struct morus640_state *state,
+ const struct morus640_glue_ops *ops,
+ struct scatterlist *sg_src, unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct morus640_block buf;
+ /* number of bytes currently held in buf */
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ /* buffered bytes plus this segment reach at least one full block */
+ if (pos + size >= MORUS640_BLOCK_SIZE) {
+ if (pos > 0) {
+ /* top up the carried-over partial block and flush it first */
+ unsigned int fill = MORUS640_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ /*
+ * The whole remaining length is handed over, but afterwards the
+ * pointer only advances by the block-aligned part and the
+ * sub-block remainder is buffered below.  Presumably ops->ad()
+ * ignores a trailing partial block - confirm against the
+ * assembly routine.
+ */
+ ops->ad(state, src, left);
+ src += left & ~(MORUS640_BLOCK_SIZE - 1);
+ left &= MORUS640_BLOCK_SIZE - 1;
+ }
+
+ /* keep the bytes that did not fill a block for the next round */
+ memcpy(buf.bytes + pos, src, left);
+
+ pos += left;
+ assoclen -= size;
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ /* zero-pad and process the final partial block, if any */
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos);
+ ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
+ }
+}
+
+/*
+ * Run the bulk and tail routines from @ops over the payload of @req.
+ *
+ * Whole blocks are processed with ops.crypt_blocks(); bytes of a walk step
+ * that do not fill a block are returned to the walk as residue so that they
+ * are presented again together with the following data.  Only the last,
+ * genuinely partial block goes through ops.crypt_tail().  Calling the tail
+ * routine on every step whose length is not a block multiple would pad in
+ * the middle of the message whenever the input is split into chunks.
+ *
+ * NOTE(review): this relies on the walk handing out at least one full block
+ * per step until the final partial one, i.e. on the algorithm's chunksize
+ * being MORUS640_BLOCK_SIZE - confirm in MORUS640_DECLARE_ALGS.
+ */
+static void crypto_morus640_glue_process_crypt(struct morus640_state *state,
+					       struct morus640_ops ops,
+					       struct aead_request *req)
+{
+	struct skcipher_walk walk;
+	unsigned int full;
+
+	/*
+	 * The caller in this file invokes us between kernel_fpu_begin() and
+	 * kernel_fpu_end(), so the walk has to be atomic: it must not sleep.
+	 */
+	ops.skcipher_walk_init(&walk, req, true);
+
+	while (walk.nbytes >= MORUS640_BLOCK_SIZE) {
+		full = walk.nbytes & ~(MORUS640_BLOCK_SIZE - 1);
+
+		ops.crypt_blocks(state, walk.src.virt.addr,
+				 walk.dst.virt.addr, full);
+
+		/* hand the sub-block remainder back to the walk */
+		skcipher_walk_done(&walk, walk.nbytes - full);
+	}
+
+	if (walk.nbytes) {
+		ops.crypt_tail(state, walk.src.virt.addr, walk.dst.virt.addr,
+			       walk.nbytes);
+		skcipher_walk_done(&walk, 0);
+	}
+}
+
+/*
+ * Store the key in the transform context.  Only keys of exactly
+ * MORUS640_BLOCK_SIZE bytes are accepted; any other length sets
+ * CRYPTO_TFM_RES_BAD_KEY_LEN on the transform and returns -EINVAL.
+ */
+int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				unsigned int keylen)
+{
+	struct morus640_ctx *ctx = crypto_aead_ctx(aead);
+
+	if (keylen == MORUS640_BLOCK_SIZE) {
+		memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE);
+		return 0;
+	}
+
+	crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey);
+
+/* Accept any tag size up to MORUS_MAX_AUTH_SIZE; reject larger with -EINVAL. */
+int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm,
+				     unsigned int authsize)
+{
+	if (authsize > MORUS_MAX_AUTH_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize);
+
+/*
+ * Common body of encryption and decryption.  Inside one FPU section the
+ * state is initialised from key and IV, the associated data and then the
+ * payload are processed, and the final callback is run on @tag_xor.
+ */
+static void crypto_morus640_glue_crypt(struct aead_request *req,
+				       struct morus640_ops ops,
+				       unsigned int cryptlen,
+				       struct morus640_block *tag_xor)
+{
+	struct morus640_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	const struct morus640_glue_ops *glue = ctx->ops;
+	struct morus640_state st;
+
+	kernel_fpu_begin();
+
+	glue->init(&st, &ctx->key, req->iv);
+	crypto_morus640_glue_process_ad(&st, glue, req->src, req->assoclen);
+	crypto_morus640_glue_process_crypt(&st, ops, req);
+	glue->final(&st, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+/*
+ * AEAD encrypt: process the request with the encryption routines, then copy
+ * authsize bytes of the resulting tag block into req->dst at offset
+ * assoclen + cryptlen.  Always returns 0.
+ */
+int crypto_morus640_glue_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus640_ops enc_ops = {
+		.crypt_tail = ctx->ops->enc_tail,
+		.crypt_blocks = ctx->ops->enc,
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+	};
+	/* handed to the final callback as an all-zero block */
+	struct morus640_block tag = {};
+	unsigned int taglen = crypto_aead_authsize(tfm);
+	unsigned int datalen = req->cryptlen;
+
+	crypto_morus640_glue_crypt(req, enc_ops, datalen, &tag);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + datalen, taglen, 1);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt);
+
+/*
+ * AEAD decrypt: read the received tag from behind the ciphertext in req->src,
+ * process the request with the decryption routines and let the final callback
+ * operate on that tag block.  The message is authentic when the first
+ * authsize bytes of the block end up all zero; otherwise -EBADMSG is
+ * returned.
+ */
+int crypto_morus640_glue_decrypt(struct aead_request *req)
+{
+	static const u8 zeros[MORUS640_BLOCK_SIZE] = {};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus640_ops dec_ops = {
+		.crypt_tail = ctx->ops->dec_tail,
+		.crypt_blocks = ctx->ops->dec,
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+	};
+	struct morus640_block tag;
+	unsigned int taglen = crypto_aead_authsize(tfm);
+	unsigned int datalen = req->cryptlen - taglen;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + datalen, taglen, 0);
+
+	crypto_morus640_glue_crypt(req, dec_ops, datalen, &tag);
+
+	/* crypto_memneq() is used for the comparison against zero */
+	if (crypto_memneq(tag.bytes, zeros, taglen))
+		return -EBADMSG;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt);
+
+/*
+ * Record the implementation callbacks @ops in the transform context of
+ * @aead.  The other functions in this file reach them through ctx->ops.
+ */
+void crypto_morus640_glue_init_ops(struct crypto_aead *aead,
+ const struct morus640_glue_ops *ops)
+{
+ struct morus640_ctx *ctx = crypto_aead_ctx(aead);
+ ctx->ops = ops;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops);
+
+/*
+ * setkey for the cryptd-backed wrapper: the context holds a pointer to the
+ * cryptd AEAD, and the key is passed on to its base transform.
+ */
+int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				unsigned int keylen)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setkey(&(*slot)->base, key, keylen);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey);
+
+/*
+ * setauthsize for the cryptd-backed wrapper: passes the tag size on to the
+ * base transform of the cryptd AEAD stored in the context.
+ */
+int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead,
+				     unsigned int authsize)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+
+	return crypto_aead_setauthsize(&(*slot)->base, authsize);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize);
+
+/*
+ * Encrypt through the wrapper.  The request is retargeted at the cryptd child
+ * transform when the FPU is usable and either we are not in atomic context or
+ * nothing is queued on the cryptd transform; otherwise it goes to the cryptd
+ * base transform.
+ */
+int cryptd_morus640_glue_encrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_encrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt);
+
+/*
+ * Decrypt through the wrapper.  The request is retargeted at the cryptd child
+ * transform when the FPU is usable and either we are not in atomic context or
+ * nothing is queued on the cryptd transform; otherwise it goes to the cryptd
+ * base transform.
+ */
+int cryptd_morus640_glue_decrypt(struct aead_request *req)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct cryptd_aead *cryptd_tfm = *slot;
+	struct crypto_aead *target;
+
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_aead_queued(cryptd_tfm)))
+		target = &cryptd_tfm->base;
+	else
+		target = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, target);
+
+	return crypto_aead_decrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt);
+
+/*
+ * Transform constructor for the wrapper: allocate a cryptd AEAD for the
+ * internal algorithm named "__" + this algorithm's driver name, store it in
+ * the context and adopt the request size of its base transform.
+ *
+ * Returns 0, -ENAMETOOLONG when the internal name does not fit, or the error
+ * from cryptd_alloc_aead().
+ */
+int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **slot = crypto_aead_ctx(aead);
+	char internal_name[CRYPTO_MAX_ALG_NAME];
+	struct cryptd_aead *cryptd_tfm;
+	int len;
+
+	len = snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s",
+		       crypto_aead_alg(aead)->base.cra_driver_name);
+	if (len >= CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
+	cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*slot = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm);
+
+/* Transform destructor for the wrapper: free the cryptd AEAD kept in the context. */
+void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
deleted file mode 100644
index 6014b7b9e52a..000000000000
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ /dev/null
@@ -1,938 +0,0 @@
-# Derived from:
-# salsa20_pm.s version 20051229
-# D. J. Bernstein
-# Public domain.
-
-#include <linux/linkage.h>
-
-.text
-
-# enter salsa20_encrypt_bytes -- args on the stack: x (16-word state), m (input), out, bytes
-ENTRY(salsa20_encrypt_bytes)
- mov %esp,%eax # eax = frame size, computed next as (esp & 31) + 256
- and $31,%eax
- add $256,%eax
- sub %eax,%esp # esp is now 32-byte aligned with at least 256 bytes of scratch
- # eax_stack = eax (frame size; reloaded at ._done to restore esp)
- movl %eax,80(%esp)
- # ebx_stack = ebx
- movl %ebx,84(%esp)
- # esi_stack = esi
- movl %esi,88(%esp)
- # edi_stack = edi
- movl %edi,92(%esp)
- # ebp_stack = ebp
- movl %ebp,96(%esp)
- # x = arg1 (esp + eax is the entry-time esp; arg1 is read at offset 4 from it)
- movl 4(%esp,%eax),%edx
- # m = arg2
- movl 8(%esp,%eax),%esi
- # out = arg3
- movl 12(%esp,%eax),%edi
- # bytes = arg4
- movl 16(%esp,%eax),%ebx
- # bytes -= 0 (value unchanged; only sets the flags)
- sub $0,%ebx
- # goto done if unsigned<= (i.e. bytes == 0)
- jbe ._done
-._start:
- # in0 = *(uint32 *) (x + 0)
- movl 0(%edx),%eax
- # in1 = *(uint32 *) (x + 4)
- movl 4(%edx),%ecx
- # in2 = *(uint32 *) (x + 8)
- movl 8(%edx),%ebp
- # j0 = in0
- movl %eax,164(%esp)
- # in3 = *(uint32 *) (x + 12)
- movl 12(%edx),%eax
- # j1 = in1
- movl %ecx,168(%esp)
- # in4 = *(uint32 *) (x + 16)
- movl 16(%edx),%ecx
- # j2 = in2
- movl %ebp,172(%esp)
- # in5 = *(uint32 *) (x + 20)
- movl 20(%edx),%ebp
- # j3 = in3
- movl %eax,176(%esp)
- # in6 = *(uint32 *) (x + 24)
- movl 24(%edx),%eax
- # j4 = in4
- movl %ecx,180(%esp)
- # in7 = *(uint32 *) (x + 28)
- movl 28(%edx),%ecx
- # j5 = in5
- movl %ebp,184(%esp)
- # in8 = *(uint32 *) (x + 32)
- movl 32(%edx),%ebp
- # j6 = in6
- movl %eax,188(%esp)
- # in9 = *(uint32 *) (x + 36)
- movl 36(%edx),%eax
- # j7 = in7
- movl %ecx,192(%esp)
- # in10 = *(uint32 *) (x + 40)
- movl 40(%edx),%ecx
- # j8 = in8
- movl %ebp,196(%esp)
- # in11 = *(uint32 *) (x + 44)
- movl 44(%edx),%ebp
- # j9 = in9
- movl %eax,200(%esp)
- # in12 = *(uint32 *) (x + 48)
- movl 48(%edx),%eax
- # j10 = in10
- movl %ecx,204(%esp)
- # in13 = *(uint32 *) (x + 52)
- movl 52(%edx),%ecx
- # j11 = in11
- movl %ebp,208(%esp)
- # in14 = *(uint32 *) (x + 56)
- movl 56(%edx),%ebp
- # j12 = in12
- movl %eax,212(%esp)
- # in15 = *(uint32 *) (x + 60)
- movl 60(%edx),%eax
- # j13 = in13
- movl %ecx,216(%esp)
- # j14 = in14
- movl %ebp,220(%esp)
- # j15 = in15
- movl %eax,224(%esp)
- # x_backup = x (edx is reused below; x is read again to store the counter)
- movl %edx,64(%esp)
-._bytesatleast1:
- # bytes - 64
- cmp $64,%ebx
- # goto nocopy if unsigned>= (a full 64-byte block is available)
- jae ._nocopy
- # ctarget = out (short block: remember the real destination)
- movl %edi,228(%esp)
- # out = &tmp (tmp is the 64 bytes at the bottom of the frame)
- leal 0(%esp),%edi
- # i = bytes
- mov %ebx,%ecx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # out = &tmp
- leal 0(%esp),%edi
- # m = &tmp
- leal 0(%esp),%esi
-._nocopy:
- # out_backup = out
- movl %edi,72(%esp)
- # m_backup = m
- movl %esi,68(%esp)
- # bytes_backup = bytes
- movl %ebx,76(%esp)
- # in0 = j0
- movl 164(%esp),%eax
- # in1 = j1
- movl 168(%esp),%ecx
- # in2 = j2
- movl 172(%esp),%edx
- # in3 = j3
- movl 176(%esp),%ebx
- # x0 = in0
- movl %eax,100(%esp)
- # x1 = in1
- movl %ecx,104(%esp)
- # x2 = in2
- movl %edx,108(%esp)
- # x3 = in3
- movl %ebx,112(%esp)
- # in4 = j4
- movl 180(%esp),%eax
- # in5 = j5
- movl 184(%esp),%ecx
- # in6 = j6
- movl 188(%esp),%edx
- # in7 = j7
- movl 192(%esp),%ebx
- # x4 = in4
- movl %eax,116(%esp)
- # x5 = in5
- movl %ecx,120(%esp)
- # x6 = in6
- movl %edx,124(%esp)
- # x7 = in7
- movl %ebx,128(%esp)
- # in8 = j8
- movl 196(%esp),%eax
- # in9 = j9
- movl 200(%esp),%ecx
- # in10 = j10
- movl 204(%esp),%edx
- # in11 = j11
- movl 208(%esp),%ebx
- # x8 = in8
- movl %eax,132(%esp)
- # x9 = in9
- movl %ecx,136(%esp)
- # x10 = in10
- movl %edx,140(%esp)
- # x11 = in11
- movl %ebx,144(%esp)
- # in12 = j12
- movl 212(%esp),%eax
- # in13 = j13
- movl 216(%esp),%ecx
- # in14 = j14
- movl 220(%esp),%edx
- # in15 = j15
- movl 224(%esp),%ebx
- # x12 = in12
- movl %eax,148(%esp)
- # x13 = in13
- movl %ecx,152(%esp)
- # x14 = in14
- movl %edx,156(%esp)
- # x15 = in15
- movl %ebx,160(%esp)
- # i = 20 (round counter; each ._mainloop pass subtracts 4)
- mov $20,%ebp
- # p = x0
- movl 100(%esp),%eax
- # s = x5
- movl 120(%esp),%ecx
- # t = x10
- movl 140(%esp),%edx
- # w = x15
- movl 160(%esp),%ebx
-._mainloop:
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x12
- addl 148(%esp),%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x6
- addl 124(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x1
- movl 104(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x11
- movl 144(%esp),%edi
- # v += w
- add %ebx,%edi
- # p <<<= 7
- rol $7,%eax
- # p ^= x4
- xorl 116(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x14
- xorl 156(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x9
- xorl 136(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x3
- xorl 112(%esp),%edi
- # x4 = p
- movl %eax,116(%esp)
- # x14 = t
- movl %edx,156(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x9 = r
- movl %esi,136(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x3 = v
- movl %edi,112(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x8
- xorl 132(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x2
- xorl 108(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x13
- xorl 152(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x7
- xorl 128(%esp),%ebx
- # x8 = p
- movl %eax,132(%esp)
- # x2 = t
- movl %edx,108(%esp)
- # p += x4
- addl 116(%esp),%eax
- # x13 = s
- movl %ecx,152(%esp)
- # t += x14
- addl 156(%esp),%edx
- # x7 = w
- movl %ebx,128(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x12
- xorl 148(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x6
- xorl 124(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x1
- xorl 104(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x11
- xorl 144(%esp),%edi
- # x12 = p
- movl %eax,148(%esp)
- # x6 = t
- movl %edx,124(%esp)
- # p += x8
- addl 132(%esp),%eax
- # x1 = r
- movl %esi,104(%esp)
- # t += x2
- addl 108(%esp),%edx
- # x11 = v
- movl %edi,144(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x3
- addl 112(%esp),%eax
- # p <<<= 7
- rol $7,%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x9
- addl 136(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x4
- movl 116(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x14
- movl 156(%esp),%edi
- # v += w
- add %ebx,%edi
- # p ^= x1
- xorl 104(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x11
- xorl 144(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x6
- xorl 124(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x12
- xorl 148(%esp),%edi
- # x1 = p
- movl %eax,104(%esp)
- # x11 = t
- movl %edx,144(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x6 = r
- movl %esi,124(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x12 = v
- movl %edi,148(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x2
- xorl 108(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x8
- xorl 132(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x7
- xorl 128(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x13
- xorl 152(%esp),%ebx
- # x2 = p
- movl %eax,108(%esp)
- # x8 = t
- movl %edx,132(%esp)
- # p += x1
- addl 104(%esp),%eax
- # x7 = s
- movl %ecx,128(%esp)
- # t += x11
- addl 144(%esp),%edx
- # x13 = w
- movl %ebx,152(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x3
- xorl 112(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x9
- xorl 136(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x4
- xorl 116(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x14
- xorl 156(%esp),%edi
- # x3 = p
- movl %eax,112(%esp)
- # x9 = t
- movl %edx,136(%esp)
- # p += x2
- addl 108(%esp),%eax
- # x4 = r
- movl %esi,116(%esp)
- # t += x8
- addl 132(%esp),%edx
- # x14 = v
- movl %edi,156(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x12
- addl 148(%esp),%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x6
- addl 124(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x1
- movl 104(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x11
- movl 144(%esp),%edi
- # v += w
- add %ebx,%edi
- # p <<<= 7
- rol $7,%eax
- # p ^= x4
- xorl 116(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x14
- xorl 156(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x9
- xorl 136(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x3
- xorl 112(%esp),%edi
- # x4 = p
- movl %eax,116(%esp)
- # x14 = t
- movl %edx,156(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x9 = r
- movl %esi,136(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x3 = v
- movl %edi,112(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x8
- xorl 132(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x2
- xorl 108(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x13
- xorl 152(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x7
- xorl 128(%esp),%ebx
- # x8 = p
- movl %eax,132(%esp)
- # x2 = t
- movl %edx,108(%esp)
- # p += x4
- addl 116(%esp),%eax
- # x13 = s
- movl %ecx,152(%esp)
- # t += x14
- addl 156(%esp),%edx
- # x7 = w
- movl %ebx,128(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x12
- xorl 148(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x6
- xorl 124(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x1
- xorl 104(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x11
- xorl 144(%esp),%edi
- # x12 = p
- movl %eax,148(%esp)
- # x6 = t
- movl %edx,124(%esp)
- # p += x8
- addl 132(%esp),%eax
- # x1 = r
- movl %esi,104(%esp)
- # t += x2
- addl 108(%esp),%edx
- # x11 = v
- movl %edi,144(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x3
- addl 112(%esp),%eax
- # p <<<= 7
- rol $7,%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x9
- addl 136(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x4
- movl 116(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x14
- movl 156(%esp),%edi
- # v += w
- add %ebx,%edi
- # p ^= x1
- xorl 104(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x11
- xorl 144(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x6
- xorl 124(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x12
- xorl 148(%esp),%edi
- # x1 = p
- movl %eax,104(%esp)
- # x11 = t
- movl %edx,144(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x6 = r
- movl %esi,124(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x12 = v
- movl %edi,148(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x2
- xorl 108(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x8
- xorl 132(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x7
- xorl 128(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x13
- xorl 152(%esp),%ebx
- # x2 = p
- movl %eax,108(%esp)
- # x8 = t
- movl %edx,132(%esp)
- # p += x1
- addl 104(%esp),%eax
- # x7 = s
- movl %ecx,128(%esp)
- # t += x11
- addl 144(%esp),%edx
- # x13 = w
- movl %ebx,152(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x3
- xorl 112(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x9
- xorl 136(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x4
- xorl 116(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x14
- xorl 156(%esp),%edi
- # x3 = p
- movl %eax,112(%esp)
- # x9 = t
- movl %edx,136(%esp)
- # p += x2
- addl 108(%esp),%eax
- # x4 = r
- movl %esi,116(%esp)
- # t += x8
- addl 132(%esp),%edx
- # x14 = v
- movl %edi,156(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # i -= 4
- sub $4,%ebp
- # goto mainloop if unsigned > (5 passes in total, starting from i = 20)
- ja ._mainloop
- # x0 = p
- movl %eax,100(%esp)
- # x5 = s
- movl %ecx,120(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # x15 = w
- movl %ebx,160(%esp)
- # out = out_backup (edi and esi were used as v and r inside the loop)
- movl 72(%esp),%edi
- # m = m_backup
- movl 68(%esp),%esi
- # in0 = x0 (from here: output word = (x word + j word) ^ input word, two words at a time)
- movl 100(%esp),%eax
- # in1 = x1
- movl 104(%esp),%ecx
- # in0 += j0
- addl 164(%esp),%eax
- # in1 += j1
- addl 168(%esp),%ecx
- # in0 ^= *(uint32 *) (m + 0)
- xorl 0(%esi),%eax
- # in1 ^= *(uint32 *) (m + 4)
- xorl 4(%esi),%ecx
- # *(uint32 *) (out + 0) = in0
- movl %eax,0(%edi)
- # *(uint32 *) (out + 4) = in1
- movl %ecx,4(%edi)
- # in2 = x2
- movl 108(%esp),%eax
- # in3 = x3
- movl 112(%esp),%ecx
- # in2 += j2
- addl 172(%esp),%eax
- # in3 += j3
- addl 176(%esp),%ecx
- # in2 ^= *(uint32 *) (m + 8)
- xorl 8(%esi),%eax
- # in3 ^= *(uint32 *) (m + 12)
- xorl 12(%esi),%ecx
- # *(uint32 *) (out + 8) = in2
- movl %eax,8(%edi)
- # *(uint32 *) (out + 12) = in3
- movl %ecx,12(%edi)
- # in4 = x4
- movl 116(%esp),%eax
- # in5 = x5
- movl 120(%esp),%ecx
- # in4 += j4
- addl 180(%esp),%eax
- # in5 += j5
- addl 184(%esp),%ecx
- # in4 ^= *(uint32 *) (m + 16)
- xorl 16(%esi),%eax
- # in5 ^= *(uint32 *) (m + 20)
- xorl 20(%esi),%ecx
- # *(uint32 *) (out + 16) = in4
- movl %eax,16(%edi)
- # *(uint32 *) (out + 20) = in5
- movl %ecx,20(%edi)
- # in6 = x6
- movl 124(%esp),%eax
- # in7 = x7
- movl 128(%esp),%ecx
- # in6 += j6
- addl 188(%esp),%eax
- # in7 += j7
- addl 192(%esp),%ecx
- # in6 ^= *(uint32 *) (m + 24)
- xorl 24(%esi),%eax
- # in7 ^= *(uint32 *) (m + 28)
- xorl 28(%esi),%ecx
- # *(uint32 *) (out + 24) = in6
- movl %eax,24(%edi)
- # *(uint32 *) (out + 28) = in7
- movl %ecx,28(%edi)
- # in8 = x8
- movl 132(%esp),%eax
- # in9 = x9
- movl 136(%esp),%ecx
- # in8 += j8
- addl 196(%esp),%eax
- # in9 += j9
- addl 200(%esp),%ecx
- # in8 ^= *(uint32 *) (m + 32)
- xorl 32(%esi),%eax
- # in9 ^= *(uint32 *) (m + 36)
- xorl 36(%esi),%ecx
- # *(uint32 *) (out + 32) = in8
- movl %eax,32(%edi)
- # *(uint32 *) (out + 36) = in9
- movl %ecx,36(%edi)
- # in10 = x10
- movl 140(%esp),%eax
- # in11 = x11
- movl 144(%esp),%ecx
- # in10 += j10
- addl 204(%esp),%eax
- # in11 += j11
- addl 208(%esp),%ecx
- # in10 ^= *(uint32 *) (m + 40)
- xorl 40(%esi),%eax
- # in11 ^= *(uint32 *) (m + 44)
- xorl 44(%esi),%ecx
- # *(uint32 *) (out + 40) = in10
- movl %eax,40(%edi)
- # *(uint32 *) (out + 44) = in11
- movl %ecx,44(%edi)
- # in12 = x12
- movl 148(%esp),%eax
- # in13 = x13
- movl 152(%esp),%ecx
- # in12 += j12
- addl 212(%esp),%eax
- # in13 += j13
- addl 216(%esp),%ecx
- # in12 ^= *(uint32 *) (m + 48)
- xorl 48(%esi),%eax
- # in13 ^= *(uint32 *) (m + 52)
- xorl 52(%esi),%ecx
- # *(uint32 *) (out + 48) = in12
- movl %eax,48(%edi)
- # *(uint32 *) (out + 52) = in13
- movl %ecx,52(%edi)
- # in14 = x14
- movl 156(%esp),%eax
- # in15 = x15
- movl 160(%esp),%ecx
- # in14 += j14
- addl 220(%esp),%eax
- # in15 += j15
- addl 224(%esp),%ecx
- # in14 ^= *(uint32 *) (m + 56)
- xorl 56(%esi),%eax
- # in15 ^= *(uint32 *) (m + 60)
- xorl 60(%esi),%ecx
- # *(uint32 *) (out + 56) = in14
- movl %eax,56(%edi)
- # *(uint32 *) (out + 60) = in15
- movl %ecx,60(%edi)
- # bytes = bytes_backup
- movl 76(%esp),%ebx
- # in8 = j8
- movl 196(%esp),%eax
- # in9 = j9
- movl 200(%esp),%ecx
- # in8 += 1 (words 8 and 9 are advanced together as one 64-bit counter, once per block)
- add $1,%eax
- # in9 += 0 + carry
- adc $0,%ecx
- # j8 = in8
- movl %eax,196(%esp)
- # j9 = in9
- movl %ecx,200(%esp)
- # bytes - 64
- cmp $64,%ebx
- # goto bytesatleast65 if unsigned>
- ja ._bytesatleast65
- # goto bytesatleast64 if unsigned>=
- jae ._bytesatleast64
- # m = out (short block: out points at tmp; copy the result to ctarget)
- mov %edi,%esi
- # out = ctarget
- movl 228(%esp),%edi
- # i = bytes
- mov %ebx,%ecx
- # while (i) { *out++ = *m++; --i }
- rep movsb
-._bytesatleast64:
- # x = x_backup (store the advanced counter words back into x)
- movl 64(%esp),%eax
- # in8 = j8
- movl 196(%esp),%ecx
- # in9 = j9
- movl 200(%esp),%edx
- # *(uint32 *) (x + 32) = in8
- movl %ecx,32(%eax)
- # *(uint32 *) (x + 36) = in9
- movl %edx,36(%eax)
-._done:
- # eax = eax_stack
- movl 80(%esp),%eax
- # ebx = ebx_stack
- movl 84(%esp),%ebx
- # esi = esi_stack
- movl 88(%esp),%esi
- # edi = edi_stack
- movl 92(%esp),%edi
- # ebp = ebp_stack
- movl 96(%esp),%ebp
- # leave (esp += frame size reloaded from eax_stack)
- add %eax,%esp
- ret
-._bytesatleast65:
- # bytes -= 64 (more than 64 bytes remained: advance and process the next block)
- sub $64,%ebx
- # out += 64
- add $64,%edi
- # m += 64
- add $64,%esi
- # goto bytesatleast1
- jmp ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
deleted file mode 100644
index 03a4918f41ee..000000000000
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ /dev/null
@@ -1,805 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-
-# enter salsa20_encrypt_bytes -- register args: rdi = x (state), rsi = m (input), rdx = out, rcx = bytes
-ENTRY(salsa20_encrypt_bytes)
- mov %rsp,%r11 # r11 = frame size, computed next as (rsp & 31) + 256
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp # rsp is now 32-byte aligned with at least 256 bytes of scratch
- # x = arg1
- mov %rdi,%r8
- # m = arg2 (already in rsi; the move below is a no-op)
- mov %rsi,%rsi
- # out = arg3
- mov %rdx,%rdi
- # bytes = arg4 (NOTE(review): all 64 bits of rcx are used, while the C prototype in salsa20_glue.c declares u32 bytes -- confirm callers zero-extend)
- mov %rcx,%rdx
- # unsigned>? bytes - 0
- cmp $0,%rdx
- # comment:fp stack unchanged by jump
- # goto done if !unsigned> (i.e. bytes == 0; of the registers saved below only r11 has been modified so far)
- jbe ._done
- # comment:fp stack unchanged by fallthrough
-# start:
-._start:
- # r11_stack = r11 (frame size; r11 is reused as x8 below)
- movq %r11,0(%rsp)
- # r12_stack = r12
- movq %r12,8(%rsp)
- # r13_stack = r13
- movq %r13,16(%rsp)
- # r14_stack = r14
- movq %r14,24(%rsp)
- # r15_stack = r15
- movq %r15,32(%rsp)
- # rbx_stack = rbx
- movq %rbx,40(%rsp)
- # rbp_stack = rbp
- movq %rbp,48(%rsp)
- # in0 = *(uint64 *) (x + 0)
- movq 0(%r8),%rcx
- # in2 = *(uint64 *) (x + 8)
- movq 8(%r8),%r9
- # in4 = *(uint64 *) (x + 16)
- movq 16(%r8),%rax
- # in6 = *(uint64 *) (x + 24)
- movq 24(%r8),%r10
- # in8 = *(uint64 *) (x + 32)
- movq 32(%r8),%r11
- # in10 = *(uint64 *) (x + 40)
- movq 40(%r8),%r12
- # in12 = *(uint64 *) (x + 48)
- movq 48(%r8),%r13
- # in14 = *(uint64 *) (x + 56)
- movq 56(%r8),%r14
- # j0 = in0 (each j slot holds two adjacent 32-bit state words)
- movq %rcx,56(%rsp)
- # j2 = in2
- movq %r9,64(%rsp)
- # j4 = in4
- movq %rax,72(%rsp)
- # j6 = in6
- movq %r10,80(%rsp)
- # j8 = in8
- movq %r11,88(%rsp)
- # j10 = in10
- movq %r12,96(%rsp)
- # j12 = in12
- movq %r13,104(%rsp)
- # j14 = in14
- movq %r14,112(%rsp)
- # x_backup = x
- movq %r8,120(%rsp)
-# bytesatleast1:
-._bytesatleast1:
- # unsigned<? bytes - 64
- cmp $64,%rdx
- # comment:fp stack unchanged by jump
- # goto nocopy if !unsigned<
- jae ._nocopy
- # ctarget = out (short block: remember the real destination)
- movq %rdi,128(%rsp)
- # out = &tmp (tmp starts at 192(%rsp))
- leaq 192(%rsp),%rdi
- # i = bytes
- mov %rdx,%rcx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # out = &tmp
- leaq 192(%rsp),%rdi
- # m = &tmp
- leaq 192(%rsp),%rsi
- # comment:fp stack unchanged by fallthrough
-# nocopy:
-._nocopy:
- # out_backup = out
- movq %rdi,136(%rsp)
- # m_backup = m
- movq %rsi,144(%rsp)
- # bytes_backup = bytes
- movq %rdx,152(%rsp)
- # x1 = j0 (unpack: the odd word is the high half, shifted down below)
- movq 56(%rsp),%rdi
- # x0 = x1 (the even word stays in the low half; the upper 32 bits are not cleared here)
- mov %rdi,%rdx
- # (uint64) x1 >>= 32
- shr $32,%rdi
- # x3 = j2
- movq 64(%rsp),%rsi
- # x2 = x3
- mov %rsi,%rcx
- # (uint64) x3 >>= 32
- shr $32,%rsi
- # x5 = j4
- movq 72(%rsp),%r8
- # x4 = x5
- mov %r8,%r9
- # (uint64) x5 >>= 32
- shr $32,%r8
- # x5_stack = x5
- movq %r8,160(%rsp)
- # x7 = j6
- movq 80(%rsp),%r8
- # x6 = x7
- mov %r8,%rax
- # (uint64) x7 >>= 32
- shr $32,%r8
- # x9 = j8
- movq 88(%rsp),%r10
- # x8 = x9
- mov %r10,%r11
- # (uint64) x9 >>= 32
- shr $32,%r10
- # x11 = j10
- movq 96(%rsp),%r12
- # x10 = x11
- mov %r12,%r13
- # x10_stack = x10
- movq %r13,168(%rsp)
- # (uint64) x11 >>= 32
- shr $32,%r12
- # x13 = j12
- movq 104(%rsp),%r13
- # x12 = x13
- mov %r13,%r14
- # (uint64) x13 >>= 32
- shr $32,%r13
- # x15 = j14
- movq 112(%rsp),%r15
- # x14 = x15
- mov %r15,%rbx
- # (uint64) x15 >>= 32
- shr $32,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # i = 20 (round counter; each ._mainloop pass subtracts 4)
- mov $20,%r15
-# mainloop:
-._mainloop:
- # i_backup = i (r15 is reused as a working register inside the loop)
- movq %r15,184(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x12 + x0
- lea (%r14,%rdx),%rbp
- # (uint32) a <<<= 7 (the 32-bit rotate also zeroes the upper half of rbp)
- rol $7,%ebp
- # x4 ^= a
- xor %rbp,%r9
- # b = x1 + x5
- lea (%rdi,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x9 ^= b
- xor %rbp,%r10
- # a = x0 + x4
- lea (%rdx,%r9),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x8 ^= a
- xor %rbp,%r11
- # b = x5 + x9
- lea (%r15,%r10),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x13 ^= b
- xor %rbp,%r13
- # a = x4 + x8
- lea (%r9,%r11),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x12 ^= a
- xor %rbp,%r14
- # b = x9 + x13
- lea (%r10,%r13),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x1 ^= b
- xor %rbp,%rdi
- # a = x8 + x12
- lea (%r11,%r14),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x13 + x1
- lea (%r13,%rdi),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x6 + x10
- lea (%rax,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x14 ^= c
- xor %r15,%rbx
- # c = x10 + x14
- lea (%rbp,%rbx),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x2 ^= c
- xor %r15,%rcx
- # c = x14 + x2
- lea (%rbx,%rcx),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x6 ^= c
- xor %r15,%rax
- # c = x2 + x6
- lea (%rcx,%rax),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x11 + x15
- lea (%r12,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x3 ^= d
- xor %rbp,%rsi
- # d = x15 + x3
- lea (%r15,%rsi),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x7 ^= d
- xor %rbp,%r8
- # d = x3 + x7
- lea (%rsi,%r8),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x11 ^= d
- xor %rbp,%r12
- # d = x7 + x11
- lea (%r8,%r12),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x3 + x0
- lea (%rsi,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x1 ^= a
- xor %rbp,%rdi
- # b = x4 + x5
- lea (%r9,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x6 ^= b
- xor %rbp,%rax
- # a = x0 + x1
- lea (%rdx,%rdi),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x2 ^= a
- xor %rbp,%rcx
- # b = x5 + x6
- lea (%r15,%rax),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x7 ^= b
- xor %rbp,%r8
- # a = x1 + x2
- lea (%rdi,%rcx),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x3 ^= a
- xor %rbp,%rsi
- # b = x6 + x7
- lea (%rax,%r8),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x4 ^= b
- xor %rbp,%r9
- # a = x2 + x3
- lea (%rcx,%rsi),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x7 + x4
- lea (%r8,%r9),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x9 + x10
- lea (%r10,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x11 ^= c
- xor %r15,%r12
- # c = x10 + x11
- lea (%rbp,%r12),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x8 ^= c
- xor %r15,%r11
- # c = x11 + x8
- lea (%r12,%r11),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x9 ^= c
- xor %r15,%r10
- # c = x8 + x9
- lea (%r11,%r10),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x14 + x15
- lea (%rbx,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x12 ^= d
- xor %rbp,%r14
- # d = x15 + x12
- lea (%r15,%r14),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x13 ^= d
- xor %rbp,%r13
- # d = x12 + x13
- lea (%r14,%r13),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x14 ^= d
- xor %rbp,%rbx
- # d = x13 + x14
- lea (%r13,%rbx),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x12 + x0
- lea (%r14,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x4 ^= a
- xor %rbp,%r9
- # b = x1 + x5
- lea (%rdi,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x9 ^= b
- xor %rbp,%r10
- # a = x0 + x4
- lea (%rdx,%r9),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x8 ^= a
- xor %rbp,%r11
- # b = x5 + x9
- lea (%r15,%r10),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x13 ^= b
- xor %rbp,%r13
- # a = x4 + x8
- lea (%r9,%r11),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x12 ^= a
- xor %rbp,%r14
- # b = x9 + x13
- lea (%r10,%r13),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x1 ^= b
- xor %rbp,%rdi
- # a = x8 + x12
- lea (%r11,%r14),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x13 + x1
- lea (%r13,%rdi),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x6 + x10
- lea (%rax,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x14 ^= c
- xor %r15,%rbx
- # c = x10 + x14
- lea (%rbp,%rbx),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x2 ^= c
- xor %r15,%rcx
- # c = x14 + x2
- lea (%rbx,%rcx),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x6 ^= c
- xor %r15,%rax
- # c = x2 + x6
- lea (%rcx,%rax),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x11 + x15
- lea (%r12,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x3 ^= d
- xor %rbp,%rsi
- # d = x15 + x3
- lea (%r15,%rsi),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x7 ^= d
- xor %rbp,%r8
- # d = x3 + x7
- lea (%rsi,%r8),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x11 ^= d
- xor %rbp,%r12
- # d = x7 + x11
- lea (%r8,%r12),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x3 + x0
- lea (%rsi,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x1 ^= a
- xor %rbp,%rdi
- # b = x4 + x5
- lea (%r9,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x6 ^= b
- xor %rbp,%rax
- # a = x0 + x1
- lea (%rdx,%rdi),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x2 ^= a
- xor %rbp,%rcx
- # b = x5 + x6
- lea (%r15,%rax),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x7 ^= b
- xor %rbp,%r8
- # a = x1 + x2
- lea (%rdi,%rcx),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x3 ^= a
- xor %rbp,%rsi
- # b = x6 + x7
- lea (%rax,%r8),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x4 ^= b
- xor %rbp,%r9
- # a = x2 + x3
- lea (%rcx,%rsi),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x7 + x4
- lea (%r8,%r9),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x9 + x10
- lea (%r10,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x11 ^= c
- xor %r15,%r12
- # c = x10 + x11
- lea (%rbp,%r12),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x8 ^= c
- xor %r15,%r11
- # c = x11 + x8
- lea (%r12,%r11),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x9 ^= c
- xor %r15,%r10
- # c = x8 + x9
- lea (%r11,%r10),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x14 + x15
- lea (%rbx,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x12 ^= d
- xor %rbp,%r14
- # d = x15 + x12
- lea (%r15,%r14),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x13 ^= d
- xor %rbp,%r13
- # d = x12 + x13
- lea (%r14,%r13),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x14 ^= d
- xor %rbp,%rbx
- # d = x13 + x14
- lea (%r13,%rbx),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # i = i_backup
- movq 184(%rsp),%r15
- # unsigned>? i -= 4
- sub $4,%r15
- # comment:fp stack unchanged by jump
- # goto mainloop if unsigned> (5 passes in total, starting from i = 20)
- ja ._mainloop
- # (uint32) x2 += j2 (add-back: low word of j2 into x2, high word into x3, then repack as x3:x2 in one register)
- addl 64(%rsp),%ecx
- # x3 <<= 32
- shl $32,%rsi
- # x3 += j2
- addq 64(%rsp),%rsi
- # (uint64) x3 >>= 32
- shr $32,%rsi
- # x3 <<= 32
- shl $32,%rsi
- # x2 += x3
- add %rsi,%rcx
- # (uint32) x6 += j6
- addl 80(%rsp),%eax
- # x7 <<= 32
- shl $32,%r8
- # x7 += j6
- addq 80(%rsp),%r8
- # (uint64) x7 >>= 32
- shr $32,%r8
- # x7 <<= 32
- shl $32,%r8
- # x6 += x7
- add %r8,%rax
- # (uint32) x8 += j8
- addl 88(%rsp),%r11d
- # x9 <<= 32
- shl $32,%r10
- # x9 += j8
- addq 88(%rsp),%r10
- # (uint64) x9 >>= 32
- shr $32,%r10
- # x9 <<= 32
- shl $32,%r10
- # x8 += x9
- add %r10,%r11
- # (uint32) x12 += j12
- addl 104(%rsp),%r14d
- # x13 <<= 32
- shl $32,%r13
- # x13 += j12
- addq 104(%rsp),%r13
- # (uint64) x13 >>= 32
- shr $32,%r13
- # x13 <<= 32
- shl $32,%r13
- # x12 += x13
- add %r13,%r14
- # (uint32) x0 += j0
- addl 56(%rsp),%edx
- # x1 <<= 32
- shl $32,%rdi
- # x1 += j0
- addq 56(%rsp),%rdi
- # (uint64) x1 >>= 32
- shr $32,%rdi
- # x1 <<= 32
- shl $32,%rdi
- # x0 += x1
- add %rdi,%rdx
- # x5 = x5_stack
- movq 160(%rsp),%rdi
- # (uint32) x4 += j4
- addl 72(%rsp),%r9d
- # x5 <<= 32
- shl $32,%rdi
- # x5 += j4
- addq 72(%rsp),%rdi
- # (uint64) x5 >>= 32
- shr $32,%rdi
- # x5 <<= 32
- shl $32,%rdi
- # x4 += x5
- add %rdi,%r9
- # x10 = x10_stack
- movq 168(%rsp),%r8
- # (uint32) x10 += j10
- addl 96(%rsp),%r8d
- # x11 <<= 32
- shl $32,%r12
- # x11 += j10
- addq 96(%rsp),%r12
- # (uint64) x11 >>= 32
- shr $32,%r12
- # x11 <<= 32
- shl $32,%r12
- # x10 += x11
- add %r12,%r8
- # x15 = x15_stack
- movq 176(%rsp),%rdi
- # (uint32) x14 += j14
- addl 112(%rsp),%ebx
- # x15 <<= 32
- shl $32,%rdi
- # x15 += j14
- addq 112(%rsp),%rdi
- # (uint64) x15 >>= 32
- shr $32,%rdi
- # x15 <<= 32
- shl $32,%rdi
- # x14 += x15
- add %rdi,%rbx
- # out = out_backup
- movq 136(%rsp),%rdi
- # m = m_backup
- movq 144(%rsp),%rsi
- # x0 ^= *(uint64 *) (m + 0) (XOR the 64-byte block with the input, 8 bytes at a time)
- xorq 0(%rsi),%rdx
- # *(uint64 *) (out + 0) = x0
- movq %rdx,0(%rdi)
- # x2 ^= *(uint64 *) (m + 8)
- xorq 8(%rsi),%rcx
- # *(uint64 *) (out + 8) = x2
- movq %rcx,8(%rdi)
- # x4 ^= *(uint64 *) (m + 16)
- xorq 16(%rsi),%r9
- # *(uint64 *) (out + 16) = x4
- movq %r9,16(%rdi)
- # x6 ^= *(uint64 *) (m + 24)
- xorq 24(%rsi),%rax
- # *(uint64 *) (out + 24) = x6
- movq %rax,24(%rdi)
- # x8 ^= *(uint64 *) (m + 32)
- xorq 32(%rsi),%r11
- # *(uint64 *) (out + 32) = x8
- movq %r11,32(%rdi)
- # x10 ^= *(uint64 *) (m + 40)
- xorq 40(%rsi),%r8
- # *(uint64 *) (out + 40) = x10
- movq %r8,40(%rdi)
- # x12 ^= *(uint64 *) (m + 48)
- xorq 48(%rsi),%r14
- # *(uint64 *) (out + 48) = x12
- movq %r14,48(%rdi)
- # x14 ^= *(uint64 *) (m + 56)
- xorq 56(%rsi),%rbx
- # *(uint64 *) (out + 56) = x14
- movq %rbx,56(%rdi)
- # bytes = bytes_backup
- movq 152(%rsp),%rdx
- # in8 = j8
- movq 88(%rsp),%rcx
- # in8 += 1 (words 8 and 9 share this slot, so they advance together as one 64-bit counter, once per block)
- add $1,%rcx
- # j8 = in8
- movq %rcx,88(%rsp)
- # unsigned>? unsigned<? bytes - 64
- cmp $64,%rdx
- # comment:fp stack unchanged by jump
- # goto bytesatleast65 if unsigned>
- ja ._bytesatleast65
- # comment:fp stack unchanged by jump
- # goto bytesatleast64 if !unsigned<
- jae ._bytesatleast64
- # m = out (short block: out points at tmp; copy the result to ctarget)
- mov %rdi,%rsi
- # out = ctarget
- movq 128(%rsp),%rdi
- # i = bytes
- mov %rdx,%rcx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # comment:fp stack unchanged by fallthrough
-# bytesatleast64:
-._bytesatleast64:
- # x = x_backup (store the advanced counter back into x)
- movq 120(%rsp),%rdi
- # in8 = j8
- movq 88(%rsp),%rsi
- # *(uint64 *) (x + 32) = in8
- movq %rsi,32(%rdi)
- # r11 = r11_stack
- movq 0(%rsp),%r11
- # r12 = r12_stack
- movq 8(%rsp),%r12
- # r13 = r13_stack
- movq 16(%rsp),%r13
- # r14 = r14_stack
- movq 24(%rsp),%r14
- # r15 = r15_stack
- movq 32(%rsp),%r15
- # rbx = rbx_stack
- movq 40(%rsp),%rbx
- # rbp = rbp_stack
- movq 48(%rsp),%rbp
- # comment:fp stack unchanged by fallthrough
-# done:
-._done:
- # leave (rsp += frame size held in r11)
- add %r11,%rsp
- mov %rdi,%rax # NOTE(review): rdi/rsi are copied into rax/rdx, but the C prototype in salsa20_glue.c returns void -- presumably unused
- mov %rsi,%rdx
- ret
-# bytesatleast65:
-._bytesatleast65:
- # bytes -= 64 (more than 64 bytes remained: advance and process the next block)
- sub $64,%rdx
- # out += 64
- add $64,%rdi
- # m += 64
- add $64,%rsi
- # comment:fp stack unchanged by jump
- # goto bytesatleast1
- jmp ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
deleted file mode 100644
index b07d7d959806..000000000000
--- a/arch/x86/crypto/salsa20_glue.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Glue code for optimized assembly version of Salsa20.
- *
- * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
- *
- * The assembly codes are public domain assembly codes written by Daniel. J.
- * Bernstein <djb@cr.yp.to>. The codes are modified to include indentation
- * and to remove extraneous comments and functions that are not needed.
- * - i586 version, renamed as salsa20-i586-asm_32.S
- * available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
- * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
- * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
- *
- * Also modified to set up the initial state using the generic C code rather
- * than in assembly.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <asm/unaligned.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/salsa20.h>
-#include <linux/module.h>
-
-asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
- u32 bytes);
-
-static int salsa20_asm_crypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- u32 state[16];
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_salsa20_init(state, ctx, walk.iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes = round_down(nbytes, walk.stride);
-
- salsa20_encrypt_bytes(state, walk.src.virt.addr,
- walk.dst.virt.addr, nbytes);
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- }
-
- return err;
-}
-
-static struct skcipher_alg alg = {
- .base.cra_name = "salsa20",
- .base.cra_driver_name = "salsa20-asm",
- .base.cra_priority = 200,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct salsa20_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = SALSA20_MIN_KEY_SIZE,
- .max_keysize = SALSA20_MAX_KEY_SIZE,
- .ivsize = SALSA20_IV_SIZE,
- .chunksize = SALSA20_BLOCK_SIZE,
- .setkey = crypto_salsa20_setkey,
- .encrypt = salsa20_asm_crypt,
- .decrypt = salsa20_asm_crypt,
-};
-
-static int __init init(void)
-{
- return crypto_register_skcipher(&alg);
-}
-
-static void __exit fini(void)
-{
- crypto_unregister_skcipher(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS_CRYPTO("salsa20");
-MODULE_ALIAS_CRYPTO("salsa20-asm");
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 08acd954f00e..74a9e06b6cfd 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -436,6 +436,8 @@ static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
#endif /* CONFIG_X86_LOCAL_APIC */
+extern void apic_ack_irq(struct irq_data *data);
+
static inline void ack_APIC_irq(void)
{
/*
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index fb00a2fca990..5701f5cecd31 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -282,7 +282,9 @@
#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
+#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
diff --git a/arch/x86/include/asm/mcsafe_test.h b/arch/x86/include/asm/mcsafe_test.h
new file mode 100644
index 000000000000..eb59804b6201
--- /dev/null
+++ b/arch/x86/include/asm/mcsafe_test.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MCSAFE_TEST_H_
+#define _MCSAFE_TEST_H_
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_MCSAFE_TEST
+extern unsigned long mcsafe_test_src;
+extern unsigned long mcsafe_test_dst;
+
+static inline void mcsafe_inject_src(void *addr)
+{
+ if (addr)
+ mcsafe_test_src = (unsigned long) addr;
+ else
+ mcsafe_test_src = ~0UL;
+}
+
+static inline void mcsafe_inject_dst(void *addr)
+{
+ if (addr)
+ mcsafe_test_dst = (unsigned long) addr;
+ else
+ mcsafe_test_dst = ~0UL;
+}
+#else /* CONFIG_MCSAFE_TEST */
+static inline void mcsafe_inject_src(void *addr)
+{
+}
+
+static inline void mcsafe_inject_dst(void *addr)
+{
+}
+#endif /* CONFIG_MCSAFE_TEST */
+
+#else /* __ASSEMBLY__ */
+#include <asm/export.h>
+
+#ifdef CONFIG_MCSAFE_TEST
+.macro MCSAFE_TEST_CTL
+ .pushsection .data
+ .align 8
+ .globl mcsafe_test_src
+ mcsafe_test_src:
+ .quad 0
+ EXPORT_SYMBOL_GPL(mcsafe_test_src)
+ .globl mcsafe_test_dst
+ mcsafe_test_dst:
+ .quad 0
+ EXPORT_SYMBOL_GPL(mcsafe_test_dst)
+ .popsection
+.endm
+
+.macro MCSAFE_TEST_SRC reg count target
+ leaq \count(\reg), %r9
+ cmp mcsafe_test_src, %r9
+ ja \target
+.endm
+
+.macro MCSAFE_TEST_DST reg count target
+ leaq \count(\reg), %r9
+ cmp mcsafe_test_dst, %r9
+ ja \target
+.endm
+#else
+.macro MCSAFE_TEST_CTL
+.endm
+
+.macro MCSAFE_TEST_SRC reg count target
+.endm
+
+.macro MCSAFE_TEST_DST reg count target
+.endm
+#endif /* CONFIG_MCSAFE_TEST */
+#endif /* __ASSEMBLY__ */
+#endif /* _MCSAFE_TEST_H_ */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index cf9911b5a53c..bbc796eb0a3b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -288,21 +288,6 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
mpx_notify_unmap(mm, vma, start, end);
}
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-static inline int vma_pkey(struct vm_area_struct *vma)
-{
- unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
- VM_PKEY_BIT2 | VM_PKEY_BIT3;
-
- return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
-}
-#else
-static inline int vma_pkey(struct vm_area_struct *vma)
-{
- return 0;
-}
-#endif
-
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fda2114197b3..68b2c3150de1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -62,8 +62,8 @@
#define NHM_C3_AUTO_DEMOTE (1UL << 25)
#define NHM_C1_AUTO_DEMOTE (1UL << 26)
#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
-#define SNB_C1_AUTO_UNDEMOTE (1UL << 27)
-#define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
+#define SNB_C3_AUTO_UNDEMOTE (1UL << 27)
+#define SNB_C1_AUTO_UNDEMOTE (1UL << 28)
#define MSR_MTRRcap 0x000000fe
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 8b38df98548e..f6f6c63da62f 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -308,16 +308,20 @@ do { \
* lfence
* jmp spec_trap
* do_rop:
- * mov %rax,(%rsp)
+ * mov %rax,(%rsp) for x86_64
+ * mov %edx,(%esp) for x86_32
* retq
*
* Without retpolines configured:
*
- * jmp *%rax
+ * jmp *%rax for x86_64
+ * jmp *%edx for x86_32
*/
#ifdef CONFIG_RETPOLINE
-# define RETPOLINE_RAX_BPF_JIT_SIZE 17
-# define RETPOLINE_RAX_BPF_JIT() \
+# ifdef CONFIG_X86_64
+# define RETPOLINE_RAX_BPF_JIT_SIZE 17
+# define RETPOLINE_RAX_BPF_JIT() \
+do { \
EMIT1_off32(0xE8, 7); /* callq do_rop */ \
/* spec_trap: */ \
EMIT2(0xF3, 0x90); /* pause */ \
@@ -325,11 +329,30 @@ do { \
EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
/* do_rop: */ \
EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \
- EMIT1(0xC3); /* retq */
-#else
-# define RETPOLINE_RAX_BPF_JIT_SIZE 2
-# define RETPOLINE_RAX_BPF_JIT() \
- EMIT2(0xFF, 0xE0); /* jmp *%rax */
+ EMIT1(0xC3); /* retq */ \
+} while (0)
+# else /* !CONFIG_X86_64 */
+# define RETPOLINE_EDX_BPF_JIT() \
+do { \
+ EMIT1_off32(0xE8, 7); /* call do_rop */ \
+ /* spec_trap: */ \
+ EMIT2(0xF3, 0x90); /* pause */ \
+ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
+ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
+ /* do_rop: */ \
+ EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \
+ EMIT1(0xC3); /* ret */ \
+} while (0)
+# endif
+#else /* !CONFIG_RETPOLINE */
+# ifdef CONFIG_X86_64
+# define RETPOLINE_RAX_BPF_JIT_SIZE 2
+# define RETPOLINE_RAX_BPF_JIT() \
+ EMIT2(0xFF, 0xE0) /* jmp *%rax */
+# else /* !CONFIG_X86_64 */
+# define RETPOLINE_EDX_BPF_JIT() \
+ EMIT2(0xFF, 0xE2) /* jmp *%edx */
+# endif
#endif
#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1e53560a84bb..c85e15010f48 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -17,7 +17,6 @@
#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
-#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1)))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
@@ -55,6 +54,13 @@
#ifndef __ASSEMBLY__
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+extern phys_addr_t physical_mask;
+#define __PHYSICAL_MASK physical_mask
+#else
+#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#endif
+
extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 3c5385f9a88f..0fdcd21dadbd 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -216,7 +216,7 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
}
#endif
-static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
+static __always_inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
pgd_t pgd;
@@ -230,7 +230,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
*p4dp = native_make_p4d(native_pgd_val(pgd));
}
-static inline void native_p4d_clear(p4d_t *p4d)
+static __always_inline void native_p4d_clear(p4d_t *p4d)
{
native_set_p4d(p4d, native_make_p4d(0));
}
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 1e5a40673953..99fff853c944 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -65,7 +65,6 @@
#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
#endif
-#define __HAVE_ARCH_PTE_SPECIAL
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
_PAGE_PKEY_BIT1 | \
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 851c04b7a092..19b137f1b3be 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -9,6 +9,11 @@
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
+static inline bool arch_pkeys_enabled(void)
+{
+ return boot_cpu_has(X86_FEATURE_OSPKE);
+}
+
/*
* Try to dedicate one of the protection keys to be used as an
* execute-only protection key.
@@ -116,4 +121,12 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
extern void copy_init_pkru_to_fpregs(void);
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+ unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+ VM_PKEY_BIT2 | VM_PKEY_BIT3;
+
+ return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+}
+
#endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 22647a642e98..0af81b590a0c 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -236,7 +236,7 @@ TRACE_EVENT(vector_alloc,
TP_PROTO(unsigned int irq, unsigned int vector, bool reserved,
int ret),
- TP_ARGS(irq, vector, ret, reserved),
+ TP_ARGS(irq, vector, reserved, ret),
TP_STRUCT__entry(
__field( unsigned int, irq )
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2d27236c16a3..b85a7c54c6a1 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -301,5 +301,6 @@ extern struct x86_apic_ops x86_apic_ops;
extern void x86_early_init_platform_quirks(void);
extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);
+extern bool x86_pnpbios_disabled(void);
#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7553819c74c3..3982f79d2377 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1851,7 +1851,7 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)
* intr-remapping table entry. Hence for the io-apic
* EOI we use the pin number.
*/
- ack_APIC_irq();
+ apic_ack_irq(irq_data);
eoi_ioapic_pin(data->entry.vector, data);
}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index bb6f7a2148d7..35aaee4fc028 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -235,6 +235,15 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest)
if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))
return 0;
+ /*
+ * Careful here. @apicd might either have move_in_progress set or
+ * be enqueued for cleanup. Assigning a new vector would either
+ * leave a stale vector on some CPU around or in case of a pending
+ * cleanup corrupt the hlist.
+ */
+ if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist))
+ return -EBUSY;
+
vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
if (vector > 0)
apic_update_vector(irqd, vector, cpu);
@@ -579,8 +588,7 @@ error:
static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
struct irq_data *irqd, int ind)
{
- unsigned int cpu, vector, prev_cpu, prev_vector;
- struct apic_chip_data *apicd;
+ struct apic_chip_data apicd;
unsigned long flags;
int irq;
@@ -596,24 +604,26 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
return;
}
- apicd = irqd->chip_data;
- if (!apicd) {
+ if (!irqd->chip_data) {
seq_printf(m, "%*sVector: Not assigned\n", ind, "");
return;
}
raw_spin_lock_irqsave(&vector_lock, flags);
- cpu = apicd->cpu;
- vector = apicd->vector;
- prev_cpu = apicd->prev_cpu;
- prev_vector = apicd->prev_vector;
+ memcpy(&apicd, irqd->chip_data, sizeof(apicd));
raw_spin_unlock_irqrestore(&vector_lock, flags);
- seq_printf(m, "%*sVector: %5u\n", ind, "", vector);
- seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu);
- if (prev_vector) {
- seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vector);
- seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu);
+
+ seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector);
+ seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu);
+ if (apicd.prev_vector) {
+ seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector);
+ seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu);
}
+ seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0);
+ seq_printf(m, "%*sis_managed: %u\n", ind, "", apicd.is_managed ? 1 : 0);
+ seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0);
+ seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0);
+ seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist));
}
#endif
@@ -800,13 +810,18 @@ static int apic_retrigger_irq(struct irq_data *irqd)
return 1;
}
-void apic_ack_edge(struct irq_data *irqd)
+void apic_ack_irq(struct irq_data *irqd)
{
- irq_complete_move(irqd_cfg(irqd));
irq_move_irq(irqd);
ack_APIC_irq();
}
+void apic_ack_edge(struct irq_data *irqd)
+{
+ irq_complete_move(irqd_cfg(irqd));
+ apic_ack_irq(irqd);
+}
+
static struct irq_chip lapic_controller = {
.name = "APIC",
.irq_ack = apic_ack_edge,
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7416fc206b4a..cd0fda1fff6d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -529,18 +529,15 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
if (mode == SPEC_STORE_BYPASS_DISABLE) {
setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
/*
- * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
- * a completely different MSR and bit dependent on family.
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
+ * use a completely different MSR and bit dependent on family.
*/
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
+ if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ x86_amd_ssb_disable();
+ else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- break;
- case X86_VENDOR_AMD:
- x86_amd_ssb_disable();
- break;
}
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 95c8e507580d..910b47ee8078 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -803,6 +803,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_STIBP);
set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
}
+
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
+ }
}
void get_cpu_cap(struct cpuinfo_x86 *c)
@@ -992,7 +998,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
- !(ia32_cap & ARCH_CAP_SSB_NO))
+ !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (x86_match_cpu(cpu_no_meltdown))
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 24bfa63e86cf..ec4754f81cbd 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -845,6 +845,8 @@ static __init void rdt_quirks(void)
case INTEL_FAM6_SKYLAKE_X:
if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ else
+ set_rdt_options("!l3cat");
}
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 77e201301528..08286269fd24 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -70,7 +70,7 @@ static DEFINE_MUTEX(microcode_mutex);
/*
* Serialize late loading so that CPUs get updated one-by-one.
*/
-static DEFINE_SPINLOCK(update_lock);
+static DEFINE_RAW_SPINLOCK(update_lock);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
@@ -560,9 +560,9 @@ static int __reload_late(void *info)
if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
return -1;
- spin_lock(&update_lock);
+ raw_spin_lock(&update_lock);
apply_microcode_local(&err);
- spin_unlock(&update_lock);
+ raw_spin_unlock(&update_lock);
/* siblings return UCODE_OK because their engine got updated already */
if (err > UCODE_NFOUND) {
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 558444b23923..c610f47373e4 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -106,17 +106,9 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
memset(line, 0, LINE_SIZE);
- length = len;
- length--;
-
- if (length > LINE_SIZE - 1)
- length = LINE_SIZE - 1;
-
+ length = strncpy_from_user(line, buf, min_t(size_t, len, LINE_SIZE - 1));
if (length < 0)
- return -EINVAL;
-
- if (copy_from_user(line, buf, length))
- return -EFAULT;
+ return length;
linelen = strlen(line);
ptr = line + linelen - 1;
@@ -149,17 +141,16 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
return -EINVAL;
ptr = skip_spaces(ptr + 5);
- for (i = 0; i < MTRR_NUM_TYPES; ++i) {
- if (strcmp(ptr, mtrr_strings[i]))
- continue;
- base >>= PAGE_SHIFT;
- size >>= PAGE_SHIFT;
- err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
- if (err < 0)
- return err;
- return len;
- }
- return -EINVAL;
+ i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr);
+ if (i < 0)
+ return i;
+
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
+ if (err < 0)
+ return err;
+ return len;
}
static long
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index 8eeaa81de066..0a3e70fd00d6 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -9,10 +9,12 @@
* your option) any later version.
*/
+#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/syscore_ops.h>
#include <asm/dma.h>
+#include <asm/x86_init.h>
/*
* This module just handles suspend/resume issues with the
@@ -49,6 +51,29 @@ static struct syscore_ops i8237_syscore_ops = {
static int __init i8237A_init_ops(void)
{
+ /*
+ * From SKL PCH onwards, the legacy DMA device is removed, and the
+ * I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh) related to it are removed
+ * as well. All removed ports must return 0xff for an inb() request.
+ *
+ * Note: DMA_PAGE_2 (port 0x81) should not be checked for detecting
+ * the presence of DMA device since it may be used by BIOS to decode
+ * LPC traffic for POST codes. Original LPC only decodes one byte of
+ * port 0x80 but some BIOS may choose to enhance PCH LPC port 0x8x
+ * decoding.
+ */
+ if (dma_inb(DMA_PAGE_0) == 0xFF)
+ return -ENODEV;
+
+ /*
+ * It is not required to load this driver as newer SoC may not
+ * support 8237 DMA or bus mastering from LPC. Platform firmware
+ * must announce the support for such legacy devices via
+ * ACPI_FADT_LEGACY_DEVICES field in FADT table.
+ */
+ if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017)
+ return -ENODEV;
+
register_syscore_ops(&i8237_syscore_ops);
return 0;
}
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 2c3a1b4294eb..74383a3780dc 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -317,15 +317,12 @@ void __init idt_setup_apic_and_irq_gates(void)
set_intr_gate(i, entry);
}
- for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
#ifdef CONFIG_X86_LOCAL_APIC
+ for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
set_bit(i, system_vectors);
set_intr_gate(i, spurious_interrupt);
-#else
- entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
- set_intr_gate(i, entry);
-#endif
}
+#endif
}
/**
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 235fe6008ac8..b348a672f71d 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -33,9 +33,14 @@ void __init x86_early_init_platform_quirks(void)
x86_platform.set_legacy_features();
}
+bool __init x86_pnpbios_disabled(void)
+{
+ return x86_platform.legacy.devices.pnpbios == 0;
+}
+
#if defined(CONFIG_PNPBIOS)
bool __init arch_pnpbios_disabled(void)
{
- return x86_platform.legacy.devices.pnpbios == 0;
+ return x86_pnpbios_disabled();
}
#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5c623dfe39d1..2f86d883dd95 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1312,11 +1312,3 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
-
-void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
-{
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
- return;
-
- seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
-}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 92bf2f2e7cdd..f4f30d0c25c4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -379,7 +379,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
- F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
+ F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+ F(AMD_SSB_NO);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -664,7 +665,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ebx |= F(VIRT_SSBD);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->ebx |= F(VIRT_SSBD);
break;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 26110c202b19..950ec50f77c3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4115,7 +4115,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
msr_info->data = svm->spec_ctrl;
@@ -4217,11 +4218,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
/* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
return 1;
svm->spec_ctrl = data;
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c3b527a9f95d..298ef1479240 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -3,6 +3,7 @@
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
+#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>
@@ -183,6 +184,9 @@ ENTRY(memcpy_orig)
ENDPROC(memcpy_orig)
#ifndef CONFIG_UML
+
+MCSAFE_TEST_CTL
+
/*
* __memcpy_mcsafe - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses.
@@ -206,6 +210,8 @@ ENTRY(__memcpy_mcsafe)
subl %ecx, %edx
.L_read_leading_bytes:
movb (%rsi), %al
+ MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
+ MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
movb %al, (%rdi)
incq %rsi
@@ -221,6 +227,8 @@ ENTRY(__memcpy_mcsafe)
.L_read_words:
movq (%rsi), %r8
+ MCSAFE_TEST_SRC %rsi 8 .E_read_words
+ MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
movq %r8, (%rdi)
addq $8, %rsi
@@ -237,6 +245,8 @@ ENTRY(__memcpy_mcsafe)
movl %edx, %ecx
.L_read_trailing_bytes:
movb (%rsi), %al
+ MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
+ MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
movb %al, (%rdi)
incq %rsi
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 1b2197d13832..7ae36868aed2 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -527,6 +527,7 @@ void __init sme_enable(struct boot_params *bp)
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
sev_enabled = true;
+ physical_mask &= ~sme_me_mask;
return;
}
@@ -561,4 +562,6 @@ void __init sme_enable(struct boot_params *bp)
sme_me_mask = 0;
else
sme_me_mask = active_by_default ? me_mask : 0;
+
+ physical_mask &= ~sme_me_mask;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ffc8c13c50e4..47b5951e592b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -8,6 +8,11 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+EXPORT_SYMBOL(physical_mask);
+#endif
+
#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
@@ -114,13 +119,12 @@ static inline void pgd_list_del(pgd_t *pgd)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
- BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
- virt_to_page(pgd)->index = (pgoff_t)mm;
+ virt_to_page(pgd)->pt_mm = mm;
}
struct mm_struct *pgd_page_get_mm(struct page *page)
{
- return (struct mm_struct *)page->index;
+ return page->pt_mm;
}
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index fefb4b619598..59e123da580c 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,6 +1,9 @@
#
# Arch-specific network modules
#
-OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+ifeq ($(CONFIG_X86_32),y)
+ obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
+else
+ obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
+endif
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
deleted file mode 100644
index b33093f84528..000000000000
--- a/arch/x86/net/bpf_jit.S
+++ /dev/null
@@ -1,154 +0,0 @@
-/* bpf_jit.S : BPF JIT helper functions
- *
- * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-/*
- * Calling convention :
- * rbx : skb pointer (callee saved)
- * esi : offset of byte(s) to fetch in skb (can be scratched)
- * r10 : copy of skb->data
- * r9d : hlen = skb->len - skb->data_len
- */
-#define SKBDATA %r10
-#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */
-
-#define FUNC(name) \
- .globl name; \
- .type name, @function; \
- name:
-
-FUNC(sk_load_word)
- test %esi,%esi
- js bpf_slow_path_word_neg
-
-FUNC(sk_load_word_positive_offset)
- mov %r9d,%eax # hlen
- sub %esi,%eax # hlen - offset
- cmp $3,%eax
- jle bpf_slow_path_word
- mov (SKBDATA,%rsi),%eax
- bswap %eax /* ntohl() */
- ret
-
-FUNC(sk_load_half)
- test %esi,%esi
- js bpf_slow_path_half_neg
-
-FUNC(sk_load_half_positive_offset)
- mov %r9d,%eax
- sub %esi,%eax # hlen - offset
- cmp $1,%eax
- jle bpf_slow_path_half
- movzwl (SKBDATA,%rsi),%eax
- rol $8,%ax # ntohs()
- ret
-
-FUNC(sk_load_byte)
- test %esi,%esi
- js bpf_slow_path_byte_neg
-
-FUNC(sk_load_byte_positive_offset)
- cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
- jle bpf_slow_path_byte
- movzbl (SKBDATA,%rsi),%eax
- ret
-
-/* rsi contains offset and can be scratched */
-#define bpf_slow_path_common(LEN) \
- lea 32(%rbp), %rdx;\
- FRAME_BEGIN; \
- mov %rbx, %rdi; /* arg1 == skb */ \
- push %r9; \
- push SKBDATA; \
-/* rsi already has offset */ \
- mov $LEN,%ecx; /* len */ \
- call skb_copy_bits; \
- test %eax,%eax; \
- pop SKBDATA; \
- pop %r9; \
- FRAME_END
-
-
-bpf_slow_path_word:
- bpf_slow_path_common(4)
- js bpf_error
- mov 32(%rbp),%eax
- bswap %eax
- ret
-
-bpf_slow_path_half:
- bpf_slow_path_common(2)
- js bpf_error
- mov 32(%rbp),%ax
- rol $8,%ax
- movzwl %ax,%eax
- ret
-
-bpf_slow_path_byte:
- bpf_slow_path_common(1)
- js bpf_error
- movzbl 32(%rbp),%eax
- ret
-
-#define sk_negative_common(SIZE) \
- FRAME_BEGIN; \
- mov %rbx, %rdi; /* arg1 == skb */ \
- push %r9; \
- push SKBDATA; \
-/* rsi already has offset */ \
- mov $SIZE,%edx; /* size */ \
- call bpf_internal_load_pointer_neg_helper; \
- test %rax,%rax; \
- pop SKBDATA; \
- pop %r9; \
- FRAME_END; \
- jz bpf_error
-
-bpf_slow_path_word_neg:
- cmp SKF_MAX_NEG_OFF, %esi /* test range */
- jl bpf_error /* offset lower -> error */
-
-FUNC(sk_load_word_negative_offset)
- sk_negative_common(4)
- mov (%rax), %eax
- bswap %eax
- ret
-
-bpf_slow_path_half_neg:
- cmp SKF_MAX_NEG_OFF, %esi
- jl bpf_error
-
-FUNC(sk_load_half_negative_offset)
- sk_negative_common(2)
- mov (%rax),%ax
- rol $8,%ax
- movzwl %ax,%eax
- ret
-
-bpf_slow_path_byte_neg:
- cmp SKF_MAX_NEG_OFF, %esi
- jl bpf_error
-
-FUNC(sk_load_byte_negative_offset)
- sk_negative_common(1)
- movzbl (%rax), %eax
- ret
-
-bpf_error:
-# force a return 0 from jit handler
- xor %eax,%eax
- mov (%rbp),%rbx
- mov 8(%rbp),%r13
- mov 16(%rbp),%r14
- mov 24(%rbp),%r15
- add $40, %rbp
- leaveq
- ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d765acedc05c..8fca446aaef6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -17,15 +17,6 @@
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
-/*
- * Assembly code in arch/x86/net/bpf_jit.S
- */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
-extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
-extern u8 sk_load_byte_positive_offset[];
-extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
-extern u8 sk_load_byte_negative_offset[];
-
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
if (len == 1)
@@ -107,9 +98,6 @@ static int bpf_size_to_x86_bytes(int bpf_size)
#define X86_JLE 0x7E
#define X86_JG 0x7F
-#define CHOOSE_LOAD_FUNC(K, func) \
- ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-
/* Pick a register outside of BPF range for JIT internal work */
#define AUX_REG (MAX_BPF_JIT_REG + 1)
@@ -120,8 +108,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
* register in load/store instructions, it always needs an
* extra byte of encoding and is callee saved.
*
- * R9 caches skb->len - skb->data_len
- * R10 caches skb->data, and used for blinding (if enabled)
+ * Also x86-64 register R9 is unused. x86-64 register R10 is
+ * used for blinding (if enabled).
*/
static const int reg2hex[] = {
[BPF_REG_0] = 0, /* RAX */
@@ -196,19 +184,15 @@ static void jit_fill_hole(void *area, unsigned int size)
struct jit_context {
int cleanup_addr; /* Epilogue code offset */
- bool seen_ld_abs;
- bool seen_ax_reg;
};
/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64
-#define AUX_STACK_SPACE \
- (32 /* Space for RBX, R13, R14, R15 */ + \
- 8 /* Space for skb_copy_bits() buffer */)
+#define AUX_STACK_SPACE 40 /* Space for RBX, R13, R14, R15, tailcnt */
-#define PROLOGUE_SIZE 37
+#define PROLOGUE_SIZE 37
/*
* Emit x86-64 prologue code for BPF program and check its size.
@@ -232,20 +216,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
/* sub rbp, AUX_STACK_SPACE */
EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
- /* All classic BPF filters use R6(rbx) save it */
-
/* mov qword ptr [rbp+0],rbx */
EMIT4(0x48, 0x89, 0x5D, 0);
-
- /*
- * bpf_convert_filter() maps classic BPF register X to R7 and uses R8
- * as temporary, so all tcpdump filters need to spill/fill R7(R13) and
- * R8(R14). R9(R15) spill could be made conditional, but there is only
- * one 'bpf_error' return path out of helper functions inside bpf_jit.S
- * The overhead of extra spill is negligible for any filter other
- * than synthetic ones. Therefore not worth adding complexity.
- */
-
/* mov qword ptr [rbp+8],r13 */
EMIT4(0x4C, 0x89, 0x6D, 8);
/* mov qword ptr [rbp+16],r14 */
@@ -353,27 +325,6 @@ static void emit_bpf_tail_call(u8 **pprog)
*pprog = prog;
}
-
-static void emit_load_skb_data_hlen(u8 **pprog)
-{
- u8 *prog = *pprog;
- int cnt = 0;
-
- /*
- * r9d = skb->len - skb->data_len (headlen)
- * r10 = skb->data
- */
- /* mov %r9d, off32(%rdi) */
- EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
-
- /* sub %r9d, off32(%rdi) */
- EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
-
- /* mov %r10, off32(%rdi) */
- EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
- *pprog = prog;
-}
-
static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
u32 dst_reg, const u32 imm32)
{
@@ -462,8 +413,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
{
struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
- bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
- bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0);
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i, cnt = 0;
@@ -473,9 +422,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
emit_prologue(&prog, bpf_prog->aux->stack_depth,
bpf_prog_was_classic(bpf_prog));
- if (seen_ld_abs)
- emit_load_skb_data_hlen(&prog);
-
for (i = 0; i < insn_cnt; i++, insn++) {
const s32 imm32 = insn->imm;
u32 dst_reg = insn->dst_reg;
@@ -483,13 +429,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
u8 b2 = 0, b3 = 0;
s64 jmp_offset;
u8 jmp_cond;
- bool reload_skb_data;
int ilen;
u8 *func;
- if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
- ctx->seen_ax_reg = seen_ax_reg = true;
-
switch (insn->code) {
/* ALU */
case BPF_ALU | BPF_ADD | BPF_X:
@@ -916,36 +858,12 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP | BPF_CALL:
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
- if (seen_ld_abs) {
- reload_skb_data = bpf_helper_changes_pkt_data(func);
- if (reload_skb_data) {
- EMIT1(0x57); /* push %rdi */
- jmp_offset += 22; /* pop, mov, sub, mov */
- } else {
- EMIT2(0x41, 0x52); /* push %r10 */
- EMIT2(0x41, 0x51); /* push %r9 */
- /*
- * We need to adjust jmp offset, since
- * pop %r9, pop %r10 take 4 bytes after call insn
- */
- jmp_offset += 4;
- }
- }
if (!imm32 || !is_simm32(jmp_offset)) {
pr_err("unsupported BPF func %d addr %p image %p\n",
imm32, func, image);
return -EINVAL;
}
EMIT1_off32(0xE8, jmp_offset);
- if (seen_ld_abs) {
- if (reload_skb_data) {
- EMIT1(0x5F); /* pop %rdi */
- emit_load_skb_data_hlen(&prog);
- } else {
- EMIT2(0x41, 0x59); /* pop %r9 */
- EMIT2(0x41, 0x5A); /* pop %r10 */
- }
- }
break;
case BPF_JMP | BPF_TAIL_CALL:
@@ -1080,60 +998,6 @@ emit_jmp:
}
break;
- case BPF_LD | BPF_IND | BPF_W:
- func = sk_load_word;
- goto common_load;
- case BPF_LD | BPF_ABS | BPF_W:
- func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
-common_load:
- ctx->seen_ld_abs = seen_ld_abs = true;
- jmp_offset = func - (image + addrs[i]);
- if (!func || !is_simm32(jmp_offset)) {
- pr_err("unsupported BPF func %d addr %p image %p\n",
- imm32, func, image);
- return -EINVAL;
- }
- if (BPF_MODE(insn->code) == BPF_ABS) {
- /* mov %esi, imm32 */
- EMIT1_off32(0xBE, imm32);
- } else {
- /* mov %rsi, src_reg */
- EMIT_mov(BPF_REG_2, src_reg);
- if (imm32) {
- if (is_imm8(imm32))
- /* add %esi, imm8 */
- EMIT3(0x83, 0xC6, imm32);
- else
- /* add %esi, imm32 */
- EMIT2_off32(0x81, 0xC6, imm32);
- }
- }
- /*
- * skb pointer is in R6 (%rbx), it will be copied into
- * %rdi if skb_copy_bits() call is necessary.
- * sk_load_* helpers also use %r10 and %r9d.
- * See bpf_jit.S
- */
- if (seen_ax_reg)
- /* r10 = skb->data, mov %r10, off32(%rbx) */
- EMIT3_off32(0x4c, 0x8b, 0x93,
- offsetof(struct sk_buff, data));
- EMIT1_off32(0xE8, jmp_offset); /* call */
- break;
-
- case BPF_LD | BPF_IND | BPF_H:
- func = sk_load_half;
- goto common_load;
- case BPF_LD | BPF_ABS | BPF_H:
- func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
- goto common_load;
- case BPF_LD | BPF_IND | BPF_B:
- func = sk_load_byte;
- goto common_load;
- case BPF_LD | BPF_ABS | BPF_B:
- func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
- goto common_load;
-
case BPF_JMP | BPF_EXIT:
if (seen_exit) {
jmp_offset = ctx->cleanup_addr - addrs[i];
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 000000000000..0cc04e30adc1
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2419 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
+ *
+ * Author: Wang YanQing (udknight@gmail.com)
+ * The code is based on code and ideas from:
+ * Eric Dumazet (eric.dumazet@gmail.com)
+ * and from:
+ * Shubham Bansal <illusionist.neo@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+#include <linux/bpf.h>
+
+/*
+ * eBPF prog stack layout:
+ *
+ * high
+ * original ESP => +-----+
+ * | | callee saved registers
+ * +-----+
+ * | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP => +-----+
+ * | ... | eBPF prog stack
+ * +-----+
+ * |RSVD | JIT scratchpad
+ * current ESP => +-----+
+ * | |
+ * | ... | Function call stack
+ * | |
+ * +-----+
+ * low
+ *
+ * The callee saved registers:
+ *
+ * high
+ * original ESP => +------------------+ \
+ * | ebp | |
+ * current EBP => +------------------+ } callee saved registers
+ * | ebx,esi,edi | |
+ * +------------------+ /
+ * low
+ */
+
+/*
+ * Store 'bytes' at 'ptr' and return the position 'len' bytes further on.
+ * A length of 1 writes one byte, 2 writes a 16-bit word; any other length
+ * writes a full 32-bit word (followed by a compiler barrier), although the
+ * returned pointer still advances by 'len' only.
+ */
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+	switch (len) {
+	case 1:
+		*ptr = bytes;
+		break;
+	case 2:
+		*(u16 *)ptr = bytes;
+		break;
+	default:
+		*(u32 *)ptr = bytes;
+		barrier();
+		break;
+	}
+	return ptr + len;
+}
+
+/*
+ * Emission helpers. They expand in place and require the enclosing function
+ * to have two locals: 'prog' (the write cursor, advanced through emit_code())
+ * and 'cnt' (running count of bytes emitted, increased by 'len').
+ */
+#define EMIT(bytes, len) \
+ do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+/* EMITn packs n bytes into one value, b1 in the lowest-order byte. */
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) \
+ EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+/* EMITn_off32 emits the n bytes and then 'off' as a separate 4-byte value. */
+#define EMIT1_off32(b1, off) \
+ do { EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+/*
+ * Displacement from the end of a jump of 'jmp_insn_len' bytes, emitted at
+ * the current 'cnt', to the byte offset 'label'.
+ */
+#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
+
+/* True when 'value' fits in a signed 8-bit immediate (-128..127). */
+static bool is_imm8(int value)
+{
+	if (value < -128 || value > 127)
+		return false;
+	return true;
+}
+
+/* True when 'value' is unchanged by a round trip through s32. */
+static bool is_simm32(s64 value)
+{
+	s32 narrowed = (s32) value;
+
+	return (s64) narrowed == value;
+}
+
+#define STACK_OFFSET(k) (k)
+#define TCALL_CNT (MAX_BPF_JIT_REG + 0) /* Tail Call Count */
+
+#define IA32_EAX (0x0)
+#define IA32_EBX (0x3)
+#define IA32_ECX (0x1)
+#define IA32_EDX (0x2)
+#define IA32_ESI (0x6)
+#define IA32_EDI (0x7)
+#define IA32_EBP (0x5)
+#define IA32_ESP (0x4)
+
+/*
+ * List of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB 0x72
+#define IA32_JAE 0x73
+#define IA32_JE 0x74
+#define IA32_JNE 0x75
+#define IA32_JBE 0x76
+#define IA32_JA 0x77
+#define IA32_JL 0x7C
+#define IA32_JGE 0x7D
+#define IA32_JLE 0x7E
+#define IA32_JG 0x7F
+
+/*
+ * Map eBPF registers to IA32 32bit registers or stack scratch space.
+ *
+ * 1. All the registers, R0-R10, are mapped to scratch space on stack.
+ * 2. We need two 64 bit temp registers to do complex operations on eBPF
+ * registers.
+ * 3. For performance reasons, BPF_REG_AX, which is used for constant
+ * blinding, is mapped to a real hardware register pair, IA32_ESI and IA32_EDI.
+ *
+ * As the eBPF registers are all 64 bit registers and IA32 has only 32 bit
+ * registers, we have to map each eBPF registers with two IA32 32 bit regs
+ * or scratch memory space and we have to build eBPF 64 bit register from those.
+ *
+ * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
+ */
+static const u8 bpf2ia32[][2] = {
+ /*
+ * Each entry is a {low word, high word} pair. For every register except
+ * BPF_REG_AX the two values are STACK_OFFSET() slots; for BPF_REG_AX
+ * they are IA32 register numbers.
+ */
+
+ /* Return value from in-kernel function, and exit value from eBPF */
+ [BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+ /* The arguments from eBPF program to in-kernel function */
+ /* Stored on stack scratch space */
+ [BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+ [BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+ [BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+ [BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+ [BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+ /* Callee saved registers that in-kernel function will preserve */
+ /* Stored on stack scratch space */
+ [BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+ [BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+ [BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+ [BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+ /* Read only Frame Pointer to access Stack */
+ [BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+ /* Temporary register for blinding constants. */
+ [BPF_REG_AX] = {IA32_ESI, IA32_EDI},
+
+ /* Tail call count. Stored on stack scratch space. */
+ [TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+};
+
+#define dst_lo dst[0]
+#define dst_hi dst[1]
+#define src_lo src[0]
+#define src_hi src[1]
+
+#define STACK_ALIGNMENT 8
+/*
+ * Stack space for BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9, BPF_REG_FP and
+ * the tail call count: 12 slots of 8 bytes each. BPF_REG_AX is kept in
+ * hardware registers and needs no stack slot.
+ */
+#define SCRATCH_SIZE 96
+
+/* Total stack size used in JITed code */
+#define _STACK_SIZE (stack_depth + SCRATCH_SIZE)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Fold the register number 'dst_reg' into the IA32 opcode/ModRM 'byte'. */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+	u32 encoded = byte + dst_reg;
+
+	return encoded;
+}
+
+/*
+ * Fold two register numbers into the IA32 ModRM 'byte': 'dst_reg' is added
+ * as is (bits 0-2), 'src_reg' is added shifted left by three (bits 3-5).
+ */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+	u32 reg_field = src_reg << 3;
+
+	return byte + dst_reg + reg_field;
+}
+
+/* Pad 'size' bytes starting at 'area' with int3 (0xcc) instructions. */
+static void jit_fill_hole(void *area, unsigned int size)
+{
+	const int int3_opcode = 0xcc;
+
+	memset(area, int3_opcode, size);
+}
+
+/*
+ * dst = imm (4 bytes)
+ *
+ * 'dstk' selects whether 'dst' is a stack slot (offset from ebp) or an IA32
+ * register. Storing zero to a stack slot goes through eax, so eax is
+ * overwritten in that case.
+ */
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (!dstk) {
+		if (val != 0) {
+			/* mov dst,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, dst),
+				    val);
+		} else {
+			/* xor dst,dst */
+			EMIT2(0x33, add_2reg(0xC0, dst, dst));
+		}
+	} else if (val != 0) {
+		/* mov dword ptr [ebp+off],imm32 */
+		EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
+			    STACK_VAR(dst), val);
+	} else {
+		/* xor eax,eax */
+		EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = src (4 bytes)
+ *
+ * 'dstk'/'sstk' say whether dst/src are stack slots (offsets from ebp) or
+ * IA32 registers. A stack-resident source is loaded into eax first.
+ */
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 from = src;
+
+	if (sstk) {
+		from = IA32_EAX;
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+	}
+
+	if (!dstk) {
+		/* mov dst,from */
+		EMIT2(0x89, add_2reg(0xC0, dst, from));
+	} else {
+		/* mov dword ptr [ebp+off],from */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, from), STACK_VAR(dst));
+	}
+
+	*pprog = prog;
+}
+
+/*
+ * dst = src
+ * With is64 both 32-bit halves are copied; otherwise only the low half is
+ * copied and the high half of dst is set to zero.
+ */
+static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
+				     const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog);
+
+	if (!is64) {
+		/* zero out high 4 bytes */
+		emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+		return;
+	}
+
+	/* complete 8 byte move */
+	emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog);
+}
+
+/*
+ * dst = imm, sign extended
+ * The low half receives 'val'. The high half receives all ones when is64 is
+ * set and bit 31 of 'val' is set, and zero otherwise.
+ */
+static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
+				     const u32 val, bool dstk, u8 **pprog)
+{
+	const u32 hi = (is64 && (val & (1<<31))) ? (u32)~0 : 0;
+
+	emit_ia32_mov_i(dst_lo, val, dstk, pprog);
+	emit_ia32_mov_i(dst_hi, hi, dstk, pprog);
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst * src
+ *
+ * dst is brought into eax and a stack-resident src into ecx; the product's
+ * low 32 bits are taken from eax. The one-operand 'mul' also writes edx.
+ */
+static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 multiplier = src;
+
+	if (sstk) {
+		multiplier = IA32_ECX;
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+	}
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+	} else {
+		/* mov eax,dst */
+		EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+	}
+
+	/* mul multiplier */
+	EMIT2(0xF7, add_1reg(0xE0, multiplier));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	} else {
+		/* mov dst,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+	}
+
+	*pprog = prog;
+}
+
+/*
+ * Little-endian conversion of the low 'val' bits (16, 32 or 64) of dst.
+ * No bytes are swapped: the value is narrowed to 'val' bits and zero
+ * extended to 64 bits. For val == 64 nothing is emitted at all.
+ * A stack-resident dst (dstk) is staged in eax (low) / edx (high).
+ */
+static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ /* Load dst from the stack; skipped for 64 bit, where nothing changes */
+ if (dstk && val != 64) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ switch (val) {
+ case 16:
+ /*
+ * Emit 'movzwl eax,ax' to zero extend 16-bit
+ * into 64 bit
+ */
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 32:
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 64:
+ /* nop */
+ break;
+ }
+
+ /* Write the result back to the stack slots */
+ if (dstk && val != 64) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/*
+ * Big-endian conversion of the low 'val' bits (16, 32 or 64) of dst.
+ * 16 and 32 bit: the bytes of the low word are swapped and the high word
+ * is cleared. 64 bit: both words are byte swapped and then exchanged
+ * with each other, using ecx as the temporary.
+ * A stack-resident dst (dstk) is staged in eax (low) / edx (high).
+ */
+static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ switch (val) {
+ case 16:
+ /* Emit 'ror %ax, 8' to swap lower 2 bytes */
+ EMIT1(0x66);
+ EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8);
+
+ /* movzwl dreg_lo,dreg_lo: zero extend the swapped 16 bits */
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 32:
+ /* Emit 'bswap eax' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_lo));
+
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 64:
+ /* Emit 'bswap eax' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_lo));
+
+ /* Emit 'bswap edx' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_hi));
+
+ /* Exchange the two swapped words through ecx */
+ /* mov ecx,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi));
+ /* mov dreg_hi,dreg_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+ /* mov dreg_lo,ecx */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+ break;
+ }
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (div|mod) src
+ *
+ * The divisor is placed in ecx and the dividend in eax, with edx cleared
+ * before the unsigned 'div'. For BPF_MOD the result is taken from edx;
+ * for any other 'op' it is taken from eax. No check for a zero divisor is
+ * emitted here.
+ */
+static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src,
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src));
+ else if (src != IA32_ECX)
+ /* mov ecx,src */
+ EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ /* mov eax,dst */
+ EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+ /* xor edx,edx */
+ EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+ /* div ecx */
+ EMIT2(0xF7, add_1reg(0xF0, IA32_ECX));
+
+ if (op == BPF_MOD) {
+ if (dstk)
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst));
+ else
+ /* mov dst,edx */
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX));
+ } else {
+ if (dstk)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ /* mov dst,eax */
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+ }
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (shift) src
+ *
+ * The shift count is placed in ecx and a stack-resident dst is staged in
+ * eax. 'op' selects the shift: BPF_LSH, BPF_RSH or BPF_ARSH.
+ * NOTE(review): for any other 'op' the function returns without updating
+ * *pprog, so the loads already written to the buffer are not accounted for.
+ */
+static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src,
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg = dstk ? IA32_EAX : dst;
+ u8 b2;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+ else if (src != IA32_ECX)
+ /* mov ecx,src */
+ EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+ /* Pick the ModRM base of the 0xD3 shift-by-cl instruction */
+ switch (op) {
+ case BPF_LSH:
+ b2 = 0xE0; break;
+ case BPF_RSH:
+ b2 = 0xE8; break;
+ case BPF_ARSH:
+ b2 = 0xF8; break;
+ default:
+ return;
+ }
+ /* (shl|shr|sar) dreg,cl */
+ EMIT2(0xD3, add_1reg(b2, dreg));
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) src
+ *
+ * Emits one 32-bit half of an ALU operation. A stack-resident src is staged
+ * in eax and a stack-resident dst in edx. When 'hi' and 'is64' are both set,
+ * BPF_ADD and BPF_SUB use the with-carry/with-borrow opcodes (0x11/0x19)
+ * instead of the plain ones (0x01/0x29).
+ */
+static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
+ const u8 dst, const u8 src, bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 sreg = sstk ? IA32_EAX : src;
+ u8 dreg = dstk ? IA32_EDX : dst;
+
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+
+ if (dstk)
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst));
+
+ switch (BPF_OP(op)) {
+ /* dst = dst + src */
+ case BPF_ADD:
+ if (hi && is64)
+ EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+ else
+ EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst - src */
+ case BPF_SUB:
+ if (hi && is64)
+ EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+ else
+ EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst | src */
+ case BPF_OR:
+ EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst & src */
+ case BPF_AND:
+ EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst ^ src */
+ case BPF_XOR:
+ EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+ break;
+ }
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+ STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (64 bit)
+ * dst = dst (op) src, emitted one 32-bit half at a time, low half first.
+ * Without is64 only the low half is operated on and the high half of dst
+ * is set to zero.
+ */
+static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
+				     const u8 dst[], const u8 src[],
+				     bool dstk, bool sstk,
+				     u8 **pprog)
+{
+	emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, pprog);
+
+	if (!is64) {
+		emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+		return;
+	}
+
+	emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk, pprog);
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) val
+ *
+ * Emits one 32-bit half of an ALU operation with an immediate. A stack
+ * resident dst is staged in eax. An immediate that fits in 8 bits is
+ * encoded directly in the instruction (opcode 0x83); otherwise it is first
+ * loaded into edx and the register form of the operation is used.
+ * When 'hi' and 'is64' are both set, BPF_ADD and BPF_SUB use the
+ * with-carry/with-borrow forms. BPF_NEG ignores 'val'.
+ */
+static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
+ const u8 dst, const s32 val, bool dstk,
+ u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg = dstk ? IA32_EAX : dst;
+ u8 sreg = IA32_EDX;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+ if (!is_imm8(val))
+ /* mov edx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val);
+
+ switch (op) {
+ /* dst = dst + val */
+ case BPF_ADD:
+ if (hi && is64) {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xD0, dreg), val);
+ else
+ EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+ } else {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xC0, dreg), val);
+ else
+ EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+ }
+ break;
+ /* dst = dst - val */
+ case BPF_SUB:
+ if (hi && is64) {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xD8, dreg), val);
+ else
+ EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+ } else {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xE8, dreg), val);
+ else
+ EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+ }
+ break;
+ /* dst = dst | val */
+ case BPF_OR:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xC8, dreg), val);
+ else
+ EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst & val */
+ case BPF_AND:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xE0, dreg), val);
+ else
+ EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst ^ val */
+ case BPF_XOR:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xF0, dreg), val);
+ else
+ EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = -dst */
+ case BPF_NEG:
+ EMIT2(0xF7, add_1reg(0xD8, dreg));
+ break;
+ }
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+ STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (64 bit)
+ * dst = dst (op) val, emitted one 32-bit half at a time, low half first.
+ * With is64 the immediate is sign extended: the high half is operated on
+ * with all ones when bit 31 of 'val' is set and with zero otherwise.
+ * Without is64 the high half of dst is simply set to zero.
+ */
+static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
+				     const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	const u32 hi = (is64 && (val & (1<<31))) ? (u32)~0 : 0;
+
+	emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, pprog);
+
+	if (!is64) {
+		emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+		return;
+	}
+
+	emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, pprog);
+}
+
+/*
+ * dst = -dst (64 bit)
+ *
+ * Two's complement negation of a register pair:
+ *   neg lo      -> lo = -lo, CF = (lo != 0)
+ *   adc hi,0    -> hi = hi + CF
+ *   neg hi      -> hi = -(hi + CF)
+ * The borrow out of the low word must reach the high word, so nothing that
+ * touches the carry flag may be emitted between the first two instructions.
+ * A stack-resident dst (dstk) is staged in eax (low) / edx (high).
+ */
+static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov edx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* neg dreg_lo */
+	EMIT2(0xF7, add_1reg(0xD8, dreg_lo));
+	/* adc dreg_hi,0x0 */
+	EMIT3(0x83, add_1reg(0xD0, dreg_hi), 0x00);
+	/* neg dreg_hi */
+	EMIT2(0xF7, add_1reg(0xD8, dreg_hi));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = dst << src (64 bit)
+ *
+ * The count is taken from the low word of src and placed in ecx; a stack
+ * resident dst (dstk) is staged in eax (low) / edx (high).
+ *
+ * shld/shl handle counts 0..31, including 0, because the CPU uses only the
+ * low five bits of cl. For counts of 32 and above the low word, already
+ * shifted by (count - 32), is moved into the high word and the low word is
+ * cleared; for counts of 64 and above the high word is cleared as well.
+ *
+ * All forward branches skip a fixed number of bytes, so the emitted code
+ * does not depend on state kept between calls.
+ */
+static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[],
+				     bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov edx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* shld dreg_hi,dreg_lo,cl */
+	EMIT3(0x0F, 0xA5, add_2reg(0xC0, dreg_hi, dreg_lo));
+	/* shl dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* jb: skip the next two instructions (4 bytes) when < 32 */
+	EMIT2(IA32_JB, 4);
+
+	/* >= 32 */
+	/* mov dreg_hi,dreg_lo */
+	EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* jb: skip the next instruction (2 bytes) when < 64 */
+	EMIT2(IA32_JB, 2);
+
+	/* >= 64: dreg_lo is already zero here */
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = dst >> src (signed, 64 bit)
+ *
+ * The count is taken from the low word of src and placed in ecx; a stack
+ * resident dst (dstk) is staged in eax (low) / edx (high).
+ *
+ * shrd/sar handle counts 0..31, including 0, because the CPU uses only the
+ * low five bits of cl. For counts of 32 and above the high word, already
+ * shifted by (count - 32), is moved into the low word and the high word is
+ * filled with the sign; for counts of 64 and above the low word is filled
+ * with the sign as well.
+ *
+ * All forward branches skip a fixed number of bytes, so the emitted code
+ * does not depend on state kept between calls.
+ */
+static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
+				      bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov edx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* shrd dreg_lo,dreg_hi,cl */
+	EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi));
+	/* sar dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* jb: skip the next two instructions (5 bytes) when < 32 */
+	EMIT2(IA32_JB, 5);
+
+	/* >= 32 */
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	/* sar dreg_hi,31 */
+	EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* jb: skip the next instruction (2 bytes) when < 64 */
+	EMIT2(IA32_JB, 2);
+
+	/* >= 64: dreg_hi already holds only the sign here */
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = dst >> src (unsigned, 64 bit)
+ *
+ * The count is taken from the low word of src and placed in ecx; a stack
+ * resident dst (dstk) is staged in eax (low) / edx (high).
+ *
+ * shrd/shr handle counts 0..31, including 0, because the CPU uses only the
+ * low five bits of cl. For counts of 32 and above the high word, already
+ * shifted by (count - 32), is moved into the low word and the high word is
+ * cleared; for counts of 64 and above the low word is cleared as well.
+ *
+ * All forward branches skip a fixed number of bytes, so the emitted code
+ * does not depend on state kept between calls.
+ */
+static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov edx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* shrd dreg_lo,dreg_hi,cl */
+	EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi));
+	/* shr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* jb: skip the next two instructions (4 bytes) when < 32 */
+	EMIT2(IA32_JB, 4);
+
+	/* >= 32 */
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* jb: skip the next instruction (2 bytes) when < 64 */
+	EMIT2(IA32_JB, 2);
+
+	/* >= 64: dreg_hi is already zero here */
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = dst << val (64-bit left shift by an immediate)
+ *
+ * dst:   lo/hi slot pair describing the BPF register
+ * val:   shift count
+ * dstk:  true when dst lives in its stack slot (it is then loaded into
+ *        eax/edx and written back afterwards)
+ * pprog: in/out cursor into the instruction buffer
+ *
+ * For val < 32 the bits that cross from the low into the high word are moved
+ * by a single shld.  Unlike building them with "lo >> (32 - val)", this is
+ * also correct for val == 0: x86 masks a shift count of 32 down to 0, so that
+ * sequence would OR the whole low word into the high word.  It also leaves
+ * ebx and ecx untouched.
+ */
+static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov edx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* Do LSH operation */
+	if (val < 32) {
+		/* shld dreg_hi,dreg_lo,imm8 (0F A4 /r ib) */
+		EMIT3(0x0F, 0xA4, add_2reg(0xC0, dreg_hi, dreg_lo));
+		EMIT1(val);
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
+	} else if (val < 64) {
+		u32 value = val - 32;
+
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value);
+		/* mov dreg_hi,dreg_lo */
+		EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	} else {
+		/* Every bit is shifted out: the result is 0 */
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = dst >> val (64-bit logical right shift by an immediate)
+ *
+ * dst:   lo/hi slot pair describing the BPF register
+ * val:   shift count
+ * dstk:  true when dst lives in its stack slot (it is then loaded into
+ *        eax/edx and written back afterwards)
+ * pprog: in/out cursor into the instruction buffer
+ *
+ * For val < 32 the bits that cross from the high into the low word are moved
+ * by a single shrd.  Unlike building them with "hi << (32 - val)", this is
+ * also correct for val == 0: x86 masks a shift count of 32 down to 0, so that
+ * sequence would OR the whole high word into the low word.  It also leaves
+ * ebx and ecx untouched.
+ */
+static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov edx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shrd dreg_lo,dreg_hi,imm8 (0F AC /r ib) */
+		EMIT3(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi));
+		EMIT1(val);
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
+	} else if (val < 64) {
+		u32 value = val - 32;
+
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	} else {
+		/* Every bit is shifted out: the result is 0 */
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = dst >> val (signed, 64-bit arithmetic right shift by an immediate)
+ *
+ * dst:   lo/hi slot pair describing the BPF register
+ * val:   shift count
+ * dstk:  true when dst lives in its stack slot (it is then loaded into
+ *        eax/edx and written back afterwards)
+ * pprog: in/out cursor into the instruction buffer
+ *
+ * For val < 32 the bits that cross from the high into the low word are moved
+ * by a single shrd.  Unlike building them with "hi << (32 - val)", this is
+ * also correct for val == 0: x86 masks a shift count of 32 down to 0, so that
+ * sequence would OR the whole high word into the low word.  It also leaves
+ * ebx and ecx untouched.
+ */
+static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val,
+				      bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov edx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* Do ARSH operation */
+	if (val < 32) {
+		/* shrd dreg_lo,dreg_hi,imm8 (0F AC /r ib) */
+		EMIT3(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi));
+		EMIT1(val);
+		/* sar dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
+	} else if (val < 64) {
+		u32 value = val - 32;
+
+		/* sar dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+		/* sar dreg_hi,31: fill the high word with the sign bit */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	} else {
+		/* Only the sign survives: both words become copies of it */
+		/* sar dreg_hi,31 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * dst = dst * src, keeping the low 64 bits of the product.
+ *
+ * dst, src:   lo/hi slot pairs describing the BPF registers
+ * dstk, sstk: true when the operand lives in its stack slot
+ * pprog:      in/out cursor into the instruction buffer
+ *
+ * Emitted computation (eax, ecx and edx are used as scratch):
+ *   ecx  = low32(dst_hi * src_lo)
+ *   ecx += low32(dst_lo * src_hi)
+ *   edx:eax = dst_lo * src_lo
+ *   ecx += edx
+ *   dst_lo = eax, dst_hi = ecx
+ */
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* --- first cross term: dst_hi * src_lo --- */
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov eax,dst_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
+	}
+
+	if (sstk) {
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	} else {
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+	}
+
+	/* mov ecx,eax: start accumulating the high result word */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* --- second cross term: dst_lo * src_hi --- */
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	} else {
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+	}
+
+	if (sstk) {
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+	} else {
+		/* mul src_hi */
+		EMIT2(0xF7, add_1reg(0xE0, src_hi));
+	}
+
+	/* add ecx,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* --- low term: dst_lo * src_lo, full 64-bit product in edx:eax --- */
+	if (dstk) {
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	} else {
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+	}
+
+	if (sstk) {
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	} else {
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+	}
+
+	/* add ecx,edx: fold the carry-out of the low product into the high word */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	/* --- write the result back: lo = eax, hi = ecx --- */
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dst_lo,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dst_hi,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+/*
+ * dst = dst * imm, keeping the low 64 bits of the product.  The 32-bit
+ * immediate is sign-extended to 64 bits first (its high word is all ones
+ * when bit 31 of val is set, otherwise 0).
+ *
+ * dst:   lo/hi slot pair describing the BPF register
+ * val:   32-bit immediate
+ * dstk:  true when dst lives in its stack slot
+ * pprog: in/out cursor into the instruction buffer
+ *
+ * Emitted computation (eax, ecx and edx are used as scratch):
+ *   ecx  = low32(val * dst_hi)
+ *   ecx += low32(imm_hi * dst_lo)
+ *   edx:eax = val * dst_lo
+ *   ecx += edx
+ *   dst_lo = eax, dst_hi = ecx
+ */
+static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	/* High word of the sign-extended immediate */
+	u32 hi = (val >> 31) ? (u32)~0 : 0;
+
+	/* --- first cross term: val * dst_hi --- */
+	/* mov eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk) {
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+	} else {
+		/* mul dst_hi */
+		EMIT2(0xF7, add_1reg(0xE0, dst_hi));
+	}
+
+	/* mov ecx,eax: start accumulating the high result word */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* --- second cross term: imm_hi * dst_lo --- */
+	/* mov eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
+	if (dstk) {
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	} else {
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+	}
+	/* add ecx,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* --- low term: val * dst_lo, full 64-bit product in edx:eax --- */
+	/* mov eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk) {
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	} else {
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+	}
+
+	/* add ecx,edx: fold the carry-out of the low product into the high word */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	/* --- write the result back: lo = eax, hi = ecx --- */
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dst_lo,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dst_hi,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+/*
+ * Number of immediate bytes to emit for a store of the given BPF size
+ * class.  BPF_DW maps to 4 because the immediate is an imm32; any other
+ * value yields 0.
+ */
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+	switch (bpf_size) {
+	case BPF_B:
+		return 1;
+	case BPF_H:
+		return 2;
+	case BPF_W:
+	case BPF_DW:	/* imm32 */
+		return 4;
+	default:
+		return 0;
+	}
+}
+
+struct jit_context {
+ int cleanup_addr; /* Epilogue code offset: do_jit() stores proglen here at
+ * the first BPF_EXIT it sees and makes every later
+ * BPF_EXIT jump to this offset instead
+ */
+};
+
+/*
+ * Maximum number of bytes emitted while JITing one eBPF insn.
+ * BPF_INSN_SAFETY is extra room added on top of that when do_jit() sizes
+ * its per-instruction temp buffer.
+ */
+#define BPF_MAX_INSN_SIZE 128
+#define BPF_INSN_SAFETY 64
+
+#define PROLOGUE_SIZE 35 /* byte length of the code emit_prologue() emits (checked there with BUILD_BUG_ON); emit_bpf_tail_call() adds it to bpf_func */
+
+/*
+ * Emit prologue code for BPF program and check its size.
+ * bpf_tail_call helper will skip it while jumping into another program.
+ *
+ * pprog:       in/out cursor into the instruction buffer
+ * stack_depth: not referenced by this function's body
+ *
+ * The emitted byte count must equal PROLOGUE_SIZE; this is enforced at
+ * build time by the BUILD_BUG_ON() at the end.
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 fplo = bpf2ia32[BPF_REG_FP][0];
+ const u8 fphi = bpf2ia32[BPF_REG_FP][1];
+ const u8 *tcc = bpf2ia32[TCALL_CNT];
+
+ /* push ebp */
+ EMIT1(0x55);
+ /* mov ebp,esp */
+ EMIT2(0x89, 0xE5);
+ /* push edi (restored by emit_epilogue() from [ebp-4]) */
+ EMIT1(0x57);
+ /* push esi (restored by emit_epilogue() from [ebp-8]) */
+ EMIT1(0x56);
+ /* push ebx (restored by emit_epilogue() from [ebp-12]) */
+ EMIT1(0x53);
+
+ /* sub esp,STACK_SIZE */
+ EMIT2_off32(0x81, 0xEC, STACK_SIZE);
+ /* sub ebp,SCRATCH_SIZE+4+12 (i.e. SCRATCH_SIZE + 16, emitted as an
+ * imm8); emit_epilogue() adds the same amount back
+ */
+ EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+ /* xor ebx,ebx: ebx is the zero source for the high words stored below */
+ EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
+
+ /* Set up BPF prog stack base register:
+ * low word = the adjusted ebp itself, high word = 0 (ebx)
+ */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi));
+
+ /* Move BPF_CTX (EAX) to BPF_REG_R1; the high word is zeroed from ebx */
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1]));
+
+ /* Initialize Tail Count: both words are set to 0 (ebx) */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0]));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+ *pprog = prog;
+}
+
+/*
+ * Emit epilogue code for BPF program.
+ *
+ * Loads BPF R0 into edx:eax (eax = low word, edx = high word), undoes the
+ * ebp adjustment made by emit_prologue(), reloads ebx/esi/edi from the
+ * slots just below the frame pointer, then emits leave and ret.
+ *
+ * pprog:       in/out cursor into the instruction buffer
+ * stack_depth: not referenced by this function's body
+ */
+static void emit_epilogue(u8 **pprog, u32 stack_depth)
+{
+ u8 *prog = *pprog;
+ const u8 *r0 = bpf2ia32[BPF_REG_0];
+ int cnt = 0;
+
+ /* mov eax,dword ptr [ebp+off]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
+ /* mov edx,dword ptr [ebp+off]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
+
+ /* add ebp,SCRATCH_SIZE+4+12 (same SCRATCH_SIZE + 16 that
+ * emit_prologue() subtracted)
+ */
+ EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+
+ /* mov ebx,dword ptr [ebp-12]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
+ /* mov esi,dword ptr [ebp-8]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
+ /* mov edi,dword ptr [ebp-4]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
+
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ *pprog = prog;
+}
+
+/*
+ * Generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ * if (index >= array->map.max_entries)
+ * goto out;
+ * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
+ * goto out;
+ * tail_call_cnt++;
+ * prog = array->ptrs[index];
+ * if (prog == NULL)
+ * goto out;
+ * goto *(prog->bpf_func + prologue_size);
+ * out:
+ *
+ * array and index are read from the low words of BPF R2 and R3; the
+ * counter is the 64-bit TCALL_CNT slot.  eax, ecx, edx and ebx are used
+ * as scratch.
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 *r2 = bpf2ia32[BPF_REG_2];
+ const u8 *r3 = bpf2ia32[BPF_REG_3];
+ const u8 *tcc = bpf2ia32[TCALL_CNT];
+ u32 lo, hi;
+ static int jmp_label1 = -1; /* offset of "out:" within this sequence; recorded at the end of the first call (so it is still -1 while that call emits its jumps) and reused by later calls */
+
+ /*
+ * if (index >= array->map.max_entries)
+ * goto out;
+ */
+ /* mov eax,dword ptr [ebp+off] (eax = array) */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0]));
+ /* mov edx,dword ptr [ebp+off] (edx = index) */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0]));
+
+ /* cmp dword ptr [eax+off],edx */
+ EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX),
+ offsetof(struct bpf_array, map.max_entries));
+ /* jbe out */
+ EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
+
+ /*
+ * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
+ * goto out;
+ */
+ lo = (u32)MAX_TAIL_CALL_CNT;
+ hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0])); /* mov ecx,dword ptr [ebp+off] (counter low word) */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1])); /* mov ebx,dword ptr [ebp+off] (counter high word) */
+
+ /* cmp ebx,hi */
+ EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi);
+ EMIT2(IA32_JNE, 3); /* jne +3: skip the 3-byte "cmp ecx,lo" so the jae below tests the high-word compare */
+ /* cmp ecx,lo */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
+
+ /* jae out */
+ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+
+ /* add ecx,0x1 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01);
+ /* adc ebx,0x0 */
+ EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00);
+
+ /* mov dword ptr [ebp+off],ecx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+ /* mov dword ptr [ebp+off],ebx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ /* prog = array->ptrs[index]; */
+ /* mov edx, [eax + edx * 4 + offsetof(...)] */
+ EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs));
+
+ /*
+ * if (prog == NULL)
+ * goto out;
+ */
+ /* test edx,edx */
+ EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+ /* je out */
+ EMIT2(IA32_JE, jmp_label(jmp_label1, 2));
+
+ /* goto *(prog->bpf_func + prologue_size); */
+ /* mov edx, dword ptr [edx + offsetof(struct bpf_prog, bpf_func)] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX),
+ offsetof(struct bpf_prog, bpf_func));
+ /* add edx,prologue_size */
+ EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE);
+
+ /* mov eax,dword ptr [ebp+off] (reload ctx from the low word of R1) */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+
+ /*
+ * Now we're ready to jump into next BPF program:
+ * eax == ctx (1st arg)
+ * edx == prog->bpf_func + prologue_size
+ */
+ RETPOLINE_EDX_BPF_JIT();
+
+ if (jmp_label1 == -1)
+ jmp_label1 = cnt;
+
+ /* out: */
+ *pprog = prog;
+}
+
+/*
+ * Push the scratch stack register on top of the stack.
+ *
+ * Both words of src are always read from their stack slots via ecx; the
+ * high word is pushed first, then the low word.
+ *
+ * src:   lo/hi slot pair describing the BPF register to push
+ * pprog: in/out cursor into the instruction buffer
+ */
+static inline void emit_push_r64(const u8 src[], u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* mov ecx,dword ptr [ebp+off] (high word) */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi));
+ /* push ecx */
+ EMIT1(0x51);
+
+ /* mov ecx,dword ptr [ebp+off] (low word) */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+ /* push ecx */
+ EMIT1(0x51);
+
+ *pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ int oldproglen, struct jit_context *ctx)
+{
+ struct bpf_insn *insn = bpf_prog->insnsi;
+ int insn_cnt = bpf_prog->len;
+ bool seen_exit = false;
+ u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ int i, cnt = 0;
+ int proglen = 0;
+ u8 *prog = temp;
+
+ emit_prologue(&prog, bpf_prog->aux->stack_depth);
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const s32 imm32 = insn->imm;
+ const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
+ const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
+ const u8 code = insn->code;
+ const u8 *dst = bpf2ia32[insn->dst_reg];
+ const u8 *src = bpf2ia32[insn->src_reg];
+ const u8 *r0 = bpf2ia32[BPF_REG_0];
+ s64 jmp_offset;
+ u8 jmp_cond;
+ int ilen;
+ u8 *func;
+
+ switch (code) {
+ /* ALU operations */
+ /* dst = src */
+ case BPF_ALU | BPF_MOV | BPF_K:
+ case BPF_ALU | BPF_MOV | BPF_X:
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ case BPF_ALU64 | BPF_MOV | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mov_r64(is64, dst, src, dstk,
+ sstk, &prog);
+ break;
+ case BPF_K:
+ /* Sign-extend immediate value to dst reg */
+ emit_ia32_mov_i64(is64, dst, imm32,
+ dstk, &prog);
+ break;
+ }
+ break;
+ /* dst = dst + src/imm */
+ /* dst = dst - src/imm */
+ /* dst = dst | src/imm */
+ /* dst = dst & src/imm */
+ /* dst = dst ^ src/imm */
+ /* dst = dst * src/imm */
+ /* dst = dst << src */
+ /* dst = dst >> src */
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_ADD | BPF_X:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_X:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_X:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_X:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_alu_r64(is64, BPF_OP(code), dst,
+ src, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ emit_ia32_alu_i64(is64, BPF_OP(code), dst,
+ imm32, dstk, &prog);
+ break;
+ }
+ break;
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mul_r(dst_lo, src_lo, dstk,
+ sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_mul_r(dst_lo, IA32_ECX, dstk,
+ false, &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo,
+ dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_shift_r(BPF_OP(code), dst_lo,
+ IA32_ECX, dstk, false,
+ &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = dst / src(imm) */
+ /* dst = dst % src(imm) */
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+ src_lo, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+ IA32_ECX, dstk, false,
+ &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_X:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_X:
+ goto notyet;
+ /* dst = dst >> imm */
+ /* dst = dst << imm */
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ if (unlikely(imm32 > 31))
+ return -EINVAL;
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk,
+ false, &prog);
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = dst << imm */
+ case BPF_ALU64 | BPF_LSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_lsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = dst >> imm */
+ case BPF_ALU64 | BPF_RSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_rsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = dst << src */
+ case BPF_ALU64 | BPF_LSH | BPF_X:
+ emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> src */
+ case BPF_ALU64 | BPF_RSH | BPF_X:
+ emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> src (signed) */
+ case BPF_ALU64 | BPF_ARSH | BPF_X:
+ emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> imm (signed) */
+ case BPF_ALU64 | BPF_ARSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_arsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = ~dst */
+ case BPF_ALU | BPF_NEG:
+ emit_ia32_alu_i(is64, false, BPF_OP(code),
+ dst_lo, 0, dstk, &prog);
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = ~dst (64 bit) */
+ case BPF_ALU64 | BPF_NEG:
+ emit_ia32_neg64(dst, dstk, &prog);
+ break;
+ /* dst = dst * src/imm */
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mul_r64(dst, src, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ emit_ia32_mul_i64(dst, imm32, dstk, &prog);
+ break;
+ }
+ break;
+ /* dst = htole(dst) */
+ case BPF_ALU | BPF_END | BPF_FROM_LE:
+ emit_ia32_to_le_r64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = htobe(dst) */
+ case BPF_ALU | BPF_END | BPF_FROM_BE:
+ emit_ia32_to_be_r64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = imm64 */
+ case BPF_LD | BPF_IMM | BPF_DW: {
+ s32 hi, lo = imm32;
+
+ hi = insn[1].imm;
+ emit_ia32_mov_i(dst_lo, lo, dstk, &prog);
+ emit_ia32_mov_i(dst_hi, hi, dstk, &prog);
+ insn++;
+ i++;
+ break;
+ }
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_B:
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_DW:
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT(0xC6, 1); break;
+ case BPF_H:
+ EMIT2(0x66, 0xC7); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0xC7, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, IA32_EAX), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, IA32_EAX),
+ insn->off);
+ EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
+
+ if (BPF_SIZE(code) == BPF_DW) {
+ u32 hi;
+
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+ EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX),
+ insn->off + 4);
+ EMIT(hi, 4);
+ }
+ break;
+
+ /* STX: *(u8*)(dst_reg + off) = src_reg */
+ case BPF_STX | BPF_MEM | BPF_B:
+ case BPF_STX | BPF_MEM | BPF_H:
+ case BPF_STX | BPF_MEM | BPF_W:
+ case BPF_STX | BPF_MEM | BPF_DW:
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ if (sstk)
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(src_lo));
+ else
+ /* mov edx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT(0x88, 1); break;
+ case BPF_H:
+ EMIT2(0x66, 0x89); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0x89, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+ insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off);
+
+ if (BPF_SIZE(code) == BPF_DW) {
+ if (sstk)
+ /* mov edi,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(src_hi));
+ else
+ /* mov edi,src_hi */
+ EMIT2(0x8B, add_2reg(0xC0, src_hi,
+ IA32_EDX));
+ EMIT1(0x89);
+ if (is_imm8(insn->off + 4)) {
+ EMIT2(add_2reg(0x40, IA32_EAX,
+ IA32_EDX),
+ insn->off + 4);
+ } else {
+ EMIT1(add_2reg(0x80, IA32_EAX,
+ IA32_EDX));
+ EMIT(insn->off + 4, 4);
+ }
+ }
+ break;
+
+ /* LDX: dst_reg = *(u8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEM | BPF_B:
+ case BPF_LDX | BPF_MEM | BPF_H:
+ case BPF_LDX | BPF_MEM | BPF_W:
+ case BPF_LDX | BPF_MEM | BPF_DW:
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(src_lo));
+ else
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT2(0x0F, 0xB6); break;
+ case BPF_H:
+ EMIT2(0x0F, 0xB7); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0x8B, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+ insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off);
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov dst_lo,edx */
+ EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX));
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ case BPF_H:
+ case BPF_W:
+ if (dstk) {
+ EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+ STACK_VAR(dst_hi));
+ EMIT(0x0, 4);
+ } else {
+ EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0);
+ }
+ break;
+ case BPF_DW:
+ EMIT2_off32(0x8B,
+ add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off + 4);
+ if (dstk)
+ EMIT3(0x89,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
+ else
+ EMIT2(0x89,
+ add_2reg(0xC0, dst_hi, IA32_EDX));
+ break;
+ default:
+ break;
+ }
+ break;
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ {
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 *r2 = bpf2ia32[BPF_REG_2];
+ const u8 *r3 = bpf2ia32[BPF_REG_3];
+ const u8 *r4 = bpf2ia32[BPF_REG_4];
+ const u8 *r5 = bpf2ia32[BPF_REG_5];
+
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ goto notyet;
+
+ func = (u8 *) __bpf_call_base + imm32;
+ jmp_offset = func - (image + addrs[i]);
+
+ if (!imm32 || !is_simm32(jmp_offset)) {
+ pr_err("unsupported BPF func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r1[0]));
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r1[1]));
+
+ emit_push_r64(r5, &prog);
+ emit_push_r64(r4, &prog);
+ emit_push_r64(r3, &prog);
+ emit_push_r64(r2, &prog);
+
+ EMIT1_off32(0xE8, jmp_offset + 9);
+
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r0[0]));
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r0[1]));
+
+ /* add esp,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
+ break;
+ }
+ case BPF_JMP | BPF_TAIL_CALL:
+ emit_bpf_tail_call(&prog);
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JLT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JLE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSLE | BPF_X:
+ case BPF_JMP | BPF_JSLT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X: {
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+ u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+ STACK_VAR(src_hi));
+ }
+
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JSET | BPF_X: {
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+ u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+ STACK_VAR(src_hi));
+ }
+ /* and dreg_lo,sreg_lo */
+ EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+ /* and dreg_hi,sreg_hi */
+ EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+ /* or dreg_lo,dreg_hi */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JSET | BPF_K: {
+ u32 hi;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = IA32_ECX;
+ u8 sreg_hi = IA32_EBX;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+
+ /* mov ecx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+ /* and dreg_lo,sreg_lo */
+ EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+ /* and dreg_hi,sreg_hi */
+ EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+ /* or dreg_lo,dreg_hi */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K: {
+ u32 hi;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = IA32_ECX;
+ u8 sreg_hi = IA32_EBX;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+ /* mov ecx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+
+emit_cond_jmp: /* Convert BPF opcode to x86 */
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ jmp_cond = IA32_JE;
+ break;
+ case BPF_JSET:
+ case BPF_JNE:
+ jmp_cond = IA32_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = IA32_JA;
+ break;
+ case BPF_JLT:
+ /* LT is unsigned '<', JB in x86 */
+ jmp_cond = IA32_JB;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = IA32_JAE;
+ break;
+ case BPF_JLE:
+ /* LE is unsigned '<=', JBE in x86 */
+ jmp_cond = IA32_JBE;
+ break;
+ case BPF_JSGT:
+ /* Signed '>', GT in x86 */
+ jmp_cond = IA32_JG;
+ break;
+ case BPF_JSLT:
+ /* Signed '<', LT in x86 */
+ jmp_cond = IA32_JL;
+ break;
+ case BPF_JSGE:
+ /* Signed '>=', GE in x86 */
+ jmp_cond = IA32_JGE;
+ break;
+ case BPF_JSLE:
+ /* Signed '<=', LE in x86 */
+ jmp_cond = IA32_JLE;
+ break;
+ default: /* to silence GCC warning */
+ return -EFAULT;
+ }
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
+ }
+ case BPF_JMP | BPF_JA:
+ if (insn->off == -1)
+ /* -1 jmp instructions will always jump
+ * backwards two bytes. Explicitly handling
+ * this case avoids wasting too many passes
+ * when there are long sequences of replaced
+ * dead code.
+ */
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+
+ if (!jmp_offset)
+ /* Optimize out nop jumps */
+ break;
+emit_jmp:
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+ /* STX XADD: lock *(u32 *)(dst + off) += src */
+ case BPF_STX | BPF_XADD | BPF_W:
+ /* STX XADD: lock *(u64 *)(dst + off) += src */
+ case BPF_STX | BPF_XADD | BPF_DW:
+ goto notyet;
+ case BPF_JMP | BPF_EXIT:
+ if (seen_exit) {
+ jmp_offset = ctx->cleanup_addr - addrs[i];
+ goto emit_jmp;
+ }
+ seen_exit = true;
+ /* Update cleanup_addr */
+ ctx->cleanup_addr = proglen;
+ emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+ break;
+notyet:
+ pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+ return -EFAULT;
+ default:
+ /*
+ * This error will be seen if new instruction was added
+ * to interpreter, but not to JIT or if there is junk in
+ * bpf_prog
+ */
+ pr_err("bpf_jit: unknown opcode %02x\n", code);
+ return -EINVAL;
+ }
+
+ ilen = prog - temp;
+ if (ilen > BPF_MAX_INSN_SIZE) {
+ pr_err("bpf_jit: fatal insn size error\n");
+ return -EFAULT;
+ }
+
+ if (image) {
+ if (unlikely(proglen + ilen > oldproglen)) {
+ pr_err("bpf_jit: fatal error\n");
+ return -EFAULT;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+/*
+ * bpf_int_jit_compile() - try to translate an eBPF program to native code.
+ *
+ * @prog: program to compile.
+ *
+ * do_jit() is run repeatedly; each pass refines addrs[] (the end offset of
+ * every translated instruction). Once two consecutive passes produce the
+ * same length, the image is allocated and one more pass writes the code
+ * into it.
+ *
+ * Returns the program that should be run. On success this is the program
+ * that was compiled (the constant-blinded clone if blinding produced one)
+ * with bpf_func, jited and jited_len filled in. On any failure, or when
+ * JIT was not requested, the original program is returned with those
+ * fields untouched.
+ */
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+	struct bpf_binary_header *header = NULL;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	int proglen, oldproglen = 0;
+	struct jit_context ctx = {};
+	bool tmp_blinded = false;
+	u8 *image = NULL;
+	int *addrs;
+	int pass;
+	int i;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	/*
+	 * If blinding was requested and we failed during blinding,
+	 * we must fall back to the interpreter.
+	 */
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		/* From here on we compile the blinded clone. */
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	/*
+	 * Use kmalloc_array() rather than an open-coded multiplication so
+	 * that an overflowing element count fails the allocation instead
+	 * of yielding an undersized buffer.
+	 */
+	addrs = kmalloc_array(prog->len, sizeof(*addrs), GFP_KERNEL);
+	if (!addrs) {
+		prog = orig_prog;
+		goto out;
+	}
+
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
+	 */
+	for (proglen = 0, i = 0; i < prog->len; i++) {
+		proglen += 64;
+		addrs[i] = proglen;
+	}
+	ctx.cleanup_addr = proglen;
+
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
+	 * may converge on the last pass. In such case do one more
+	 * pass to emit the final image.
+	 */
+	for (pass = 0; pass < 20 || image; pass++) {
+		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+		if (proglen <= 0) {
+			/* Shared failure path: drop any image and give up. */
+out_image:
+			image = NULL;
+			if (header)
+				bpf_jit_binary_free(header);
+			prog = orig_prog;
+			goto out_addrs;
+		}
+		if (image) {
+			/*
+			 * This was the emitting pass; its length must match
+			 * the length the image was allocated for.
+			 */
+			if (proglen != oldproglen) {
+				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+				       proglen, oldproglen);
+				goto out_image;
+			}
+			break;
+		}
+		if (proglen == oldproglen) {
+			/* Length is stable: allocate, then emit on next pass. */
+			header = bpf_jit_binary_alloc(proglen, &image,
+						      1, jit_fill_hole);
+			if (!header) {
+				prog = orig_prog;
+				goto out_addrs;
+			}
+		}
+		oldproglen = proglen;
+		cond_resched();
+	}
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+	if (image) {
+		bpf_jit_binary_lock_ro(header);
+		prog->bpf_func = (void *)image;
+		prog->jited = 1;
+		prog->jited_len = proglen;
+	} else {
+		/* Never converged within the pass limit. */
+		prog = orig_prog;
+	}
+
+out_addrs:
+	kfree(addrs);
+out:
+	/*
+	 * When a blinded clone exists, release whichever of the two
+	 * programs is not being returned.
+	 */
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
+	return prog;
+}
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index f0114007e915..e5f753cbb1c3 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -59,24 +59,15 @@ int early_pci_allowed(void)
void early_dump_pci_device(u8 bus, u8 slot, u8 func)
{
+ u32 value[256 / 4];
int i;
- int j;
- u32 val;
- printk(KERN_INFO "pci 0000:%02x:%02x.%d config space:",
- bus, slot, func);
+ pr_info("pci 0000:%02x:%02x.%d config space:\n", bus, slot, func);
- for (i = 0; i < 256; i += 4) {
- if (!(i & 0x0f))
- printk("\n %02x:",i);
+ for (i = 0; i < 256; i += 4)
+ value[i / 4] = read_pci_config(bus, slot, func, i);
- val = read_pci_config(bus, slot, func, i);
- for (j = 0; j < 4; j++) {
- printk(" %02x", val & 0xff);
- val >>= 8;
- }
- }
- printk("\n");
+ print_hex_dump(KERN_INFO, "", DUMP_PREFIX_OFFSET, 16, 1, value, 256, false);
}
void early_dump_pci_devices(void)
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 54ef19e90705..13f4485ca388 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -636,6 +636,10 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334a, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334b, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334c, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334d, quirk_no_aersid);
#ifdef CONFIG_PHYS_ADDR_T_64BIT
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index b36caae0fb2f..b96d38288c60 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -615,7 +615,7 @@ static int uv2_3_wait_completion(struct bau_desc *bau_desc,
/* spin on the status MMR, waiting for it to go idle */
while (descriptor_stat != UV2H_DESC_IDLE) {
- if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
+ if (descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) {
/*
* A h/w bug on the destination side may
* have prevented the message being marked
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index e4cb9f4cde8a..fc13cbbb2dce 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -47,11 +47,6 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
static void uv_noop(struct irq_data *data) { }
-static void uv_ack_apic(struct irq_data *data)
-{
- ack_APIC_irq();
-}
-
static int
uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
@@ -73,7 +68,7 @@ static struct irq_chip uv_irq_chip = {
.name = "UV-CORE",
.irq_mask = uv_noop,
.irq_unmask = uv_noop,
- .irq_eoi = uv_ack_apic,
+ .irq_eoi = apic_ack_irq,
.irq_set_affinity = uv_set_irq_affinity,
};
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 13ed827c7c66..9d529f22fd9d 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -1,5 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
-mainmenu "User Mode Linux/$SUBARCH $KERNELVERSION Kernel Configuration"
+mainmenu "User Mode Linux/$(SUBARCH) $(KERNELVERSION) Kernel Configuration"
+
+comment "Compiler: $(CC_VERSION_TEXT)"
+
+source "scripts/Kconfig.include"
source "arch/um/Kconfig.common"
@@ -16,8 +20,8 @@ config UML_X86
select GENERIC_FIND_FIRST_BIT
config 64BIT
- bool "64-bit kernel" if SUBARCH = "x86"
- default SUBARCH != "i386"
+ bool "64-bit kernel" if "$(SUBARCH)" = "x86"
+ default "$(SUBARCH)" != "i386"
config X86_32
def_bool !64BIT
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 2d76106788a3..96fc2f0fdbfe 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -63,37 +63,44 @@ static noinline void xen_flush_tlb_all(void)
#define REMAP_BATCH_SIZE 16
struct remap_data {
- xen_pfn_t *mfn;
+ xen_pfn_t *pfn;
bool contiguous;
+ bool no_translate;
pgprot_t prot;
struct mmu_update *mmu_update;
};
-static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token,
unsigned long addr, void *data)
{
struct remap_data *rmd = data;
- pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
+ pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));
- /* If we have a contiguous range, just update the mfn itself,
- else update pointer to be "next mfn". */
+ /*
+ * If we have a contiguous range, just update the pfn itself,
+ * else update pointer to be "next pfn".
+ */
if (rmd->contiguous)
- (*rmd->mfn)++;
+ (*rmd->pfn)++;
else
- rmd->mfn++;
+ rmd->pfn++;
- rmd->mmu_update->ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
+ rmd->mmu_update->ptr |= rmd->no_translate ?
+ MMU_PT_UPDATE_NO_TRANSLATE :
+ MMU_NORMAL_PT_UPDATE;
rmd->mmu_update->val = pte_val_ma(pte);
rmd->mmu_update++;
return 0;
}
-static int do_remap_gfn(struct vm_area_struct *vma,
+static int do_remap_pfn(struct vm_area_struct *vma,
unsigned long addr,
- xen_pfn_t *gfn, int nr,
+ xen_pfn_t *pfn, int nr,
int *err_ptr, pgprot_t prot,
- unsigned domid,
+ unsigned int domid,
+ bool no_translate,
struct page **pages)
{
int err = 0;
@@ -104,11 +111,14 @@ static int do_remap_gfn(struct vm_area_struct *vma,
BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
- rmd.mfn = gfn;
+ rmd.pfn = pfn;
rmd.prot = prot;
- /* We use the err_ptr to indicate if there we are doing a contiguous
- * mapping or a discontigious mapping. */
+ /*
+ * We use err_ptr to indicate whether we are doing a contiguous
+ * mapping or a discontiguous mapping.
+ */
rmd.contiguous = !err_ptr;
+ rmd.no_translate = no_translate;
while (nr) {
int index = 0;
@@ -119,7 +129,7 @@ static int do_remap_gfn(struct vm_area_struct *vma,
rmd.mmu_update = mmu_update;
err = apply_to_page_range(vma->vm_mm, addr, range,
- remap_area_mfn_pte_fn, &rmd);
+ remap_area_pfn_pte_fn, &rmd);
if (err)
goto out;
@@ -173,7 +183,8 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
if (xen_feature(XENFEAT_auto_translated_physmap))
return -EOPNOTSUPP;
- return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
+ return do_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false,
+ pages);
}
EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
@@ -192,10 +203,25 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
* cause of "wrong memory was mapped in".
*/
BUG_ON(err_ptr == NULL);
- return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
+ return do_remap_pfn(vma, addr, gfn, nr, err_ptr, prot, domid,
+ false, pages);
}
EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
+/*
+ * Map an array of foreign machine frame numbers into @vma starting at @addr.
+ *
+ * Same call shape as xen_remap_domain_gfn_array(), but passes
+ * no_translate = true to do_remap_pfn(), so each PTE update is tagged
+ * MMU_PT_UPDATE_NO_TRANSLATE instead of MMU_NORMAL_PT_UPDATE.
+ *
+ * Returns -EOPNOTSUPP when XENFEAT_auto_translated_physmap is set,
+ * otherwise whatever do_remap_pfn() returns.
+ */
+int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t *mfn, int nr,
+ int *err_ptr, pgprot_t prot,
+ unsigned int domid, struct page **pages)
+{
+ /* Rejected for auto-translated guests, as in the gfn_range variant. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -EOPNOTSUPP;
+
+ return do_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid,
+ true, pages);
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+
/* Returns: 0 success */
int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
int nr, struct page **pages)
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index e1a5fbeae08d..ca2d3b2bf2af 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -54,12 +54,19 @@
* charge of setting up it's own stack, GDT and IDT.
*/
+#define PVH_GDT_ENTRY_CS 1
+#define PVH_GDT_ENTRY_DS 2
+#define PVH_GDT_ENTRY_CANARY 3
+#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
+#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
+#define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8)
+
ENTRY(pvh_start_xen)
cld
lgdt (_pa(gdt))
- mov $(__BOOT_DS),%eax
+ mov $PVH_DS_SEL,%eax
mov %eax,%ds
mov %eax,%es
mov %eax,%ss
@@ -93,11 +100,17 @@ ENTRY(pvh_start_xen)
mov %eax, %cr0
/* Jump to 64-bit mode. */
- ljmp $__KERNEL_CS, $_pa(1f)
+ ljmp $PVH_CS_SEL, $_pa(1f)
/* 64-bit entry point. */
.code64
1:
+ /* Set base address in stack canary descriptor. */
+ mov $MSR_GS_BASE,%ecx
+ mov $_pa(canary), %eax
+ xor %edx, %edx
+ wrmsr
+
call xen_prepare_pvh
/* startup_64 expects boot_params in %rsi. */
@@ -107,6 +120,17 @@ ENTRY(pvh_start_xen)
#else /* CONFIG_X86_64 */
+ /* Set base address in stack canary descriptor. */
+ movl $_pa(gdt_start),%eax
+ movl $_pa(canary),%ecx
+ movw %cx, (PVH_GDT_ENTRY_CANARY * 8) + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, (PVH_GDT_ENTRY_CANARY * 8) + 4(%eax)
+ movb %ch, (PVH_GDT_ENTRY_CANARY * 8) + 7(%eax)
+
+ mov $PVH_CANARY_SEL,%eax
+ mov %eax,%gs
+
call mk_early_pgtbl_32
mov $_pa(initial_page_table), %eax
@@ -116,13 +140,13 @@ ENTRY(pvh_start_xen)
or $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
- ljmp $__BOOT_CS, $1f
+ ljmp $PVH_CS_SEL, $1f
1:
call xen_prepare_pvh
mov $_pa(pvh_bootparams), %esi
/* startup_32 doesn't expect paging and PAE to be on. */
- ljmp $__BOOT_CS, $_pa(2f)
+ ljmp $PVH_CS_SEL, $_pa(2f)
2:
mov %cr0, %eax
and $~X86_CR0_PG, %eax
@@ -131,7 +155,7 @@ ENTRY(pvh_start_xen)
and $~X86_CR4_PAE, %eax
mov %eax, %cr4
- ljmp $__BOOT_CS, $_pa(startup_32)
+ ljmp $PVH_CS_SEL, $_pa(startup_32)
#endif
END(pvh_start_xen)
@@ -143,16 +167,19 @@ gdt:
.word 0
gdt_start:
.quad 0x0000000000000000 /* NULL descriptor */
- .quad 0x0000000000000000 /* reserved */
#ifdef CONFIG_X86_64
- .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */
+ .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */
#else
- .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */
+ .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */
#endif
- .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */
+ .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */
+ .quad GDT_ENTRY(0x4090, 0, 0x18) /* PVH_CANARY_SEL */
gdt_end:
- .balign 4
+ .balign 16
+canary:
+ .fill 48, 1, 0
+
early_stack:
.fill 256, 1, 0
early_stack_end: