summaryrefslogtreecommitdiff
path: root/arch/powerpc/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--arch/powerpc/lib/Makefile8
-rw-r--r--arch/powerpc/lib/code-patching.c4
-rw-r--r--arch/powerpc/lib/crc-t10dif.c83
-rw-r--r--arch/powerpc/lib/crc-vpmsum-template.S746
-rw-r--r--arch/powerpc/lib/crc32.c93
-rw-r--r--arch/powerpc/lib/crc32c-vpmsum_asm.S842
-rw-r--r--arch/powerpc/lib/crct10dif-vpmsum_asm.S845
-rw-r--r--arch/powerpc/lib/crypto/Kconfig22
-rw-r--r--arch/powerpc/lib/crypto/Makefile10
-rw-r--r--arch/powerpc/lib/crypto/chacha-p10-glue.c100
-rw-r--r--arch/powerpc/lib/crypto/chacha-p10le-8x.S840
-rw-r--r--arch/powerpc/lib/crypto/poly1305-p10-glue.c96
-rw-r--r--arch/powerpc/lib/crypto/poly1305-p10le_64.S1075
-rw-r--r--arch/powerpc/lib/crypto/sha256-spe-asm.S318
-rw-r--r--arch/powerpc/lib/crypto/sha256.c70
-rw-r--r--arch/powerpc/lib/vmx-helper.c2
16 files changed, 5152 insertions, 2 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index f14ecab674a3..481f968e42c7 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -3,6 +3,8 @@
# Makefile for ppc-specific library files..
#
+obj-y += crypto/
+
CFLAGS_code-patching.o += -fno-stack-protector
CFLAGS_feature-fixups.o += -fno-stack-protector
@@ -78,4 +80,10 @@ CFLAGS_xor_vmx.o += -mhard-float -maltivec $(call cc-option,-mabi=altivec)
# Enable <altivec.h>
CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include)
+obj-$(CONFIG_CRC32_ARCH) += crc32-powerpc.o
+crc32-powerpc-y := crc32.o crc32c-vpmsum_asm.o
+
+obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-powerpc.o
+crc-t10dif-powerpc-y := crc-t10dif.o crct10dif-vpmsum_asm.o
+
obj-$(CONFIG_PPC64) += $(obj64-y)
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index af97fbb3c257..f84e0337cc02 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -108,7 +108,7 @@ static int text_area_cpu_up(unsigned int cpu)
unsigned long addr;
int err;
- area = get_vm_area(PAGE_SIZE, VM_ALLOC);
+ area = get_vm_area(PAGE_SIZE, 0);
if (!area) {
WARN_ONCE(1, "Failed to create text area for cpu %d\n",
cpu);
@@ -493,7 +493,9 @@ static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool rep
orig_mm = start_using_temp_mm(patching_mm);
+ kasan_disable_current();
err = __patch_instructions(patch_addr, code, len, repeat_instr);
+ kasan_enable_current();
/* context synchronisation performed by __patch_instructions */
stop_using_temp_mm(patching_mm, orig_mm);
diff --git a/arch/powerpc/lib/crc-t10dif.c b/arch/powerpc/lib/crc-t10dif.c
new file mode 100644
index 000000000000..be23ded3a9df
--- /dev/null
+++ b/arch/powerpc/lib/crc-t10dif.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Calculate a CRC T10-DIF with vpmsum acceleration
+ *
+ * Copyright 2017, Daniel Axtens, IBM Corporation.
+ * [based on crc32c-vpmsum_glue.c]
+ */
+
+#include <asm/switch_to.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/crc-t10dif.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 64
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+
+u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+ u32 crc = crci;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
+ !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
+ return crc_t10dif_generic(crc, p, len);
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = crc_t10dif_generic(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ if (len & ~VMX_ALIGN_MASK) {
+ crc <<= 16;
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ crc >>= 16;
+ }
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = crc_t10dif_generic(crc, p, tail);
+ }
+
+ return crc & 0xffff;
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+static int __init crc_t10dif_powerpc_init(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
+ static_branch_enable(&have_vec_crypto);
+ return 0;
+}
+subsys_initcall(crc_t10dif_powerpc_init);
+
+static void __exit crc_t10dif_powerpc_exit(void)
+{
+}
+module_exit(crc_t10dif_powerpc_exit);
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/lib/crc-vpmsum-template.S b/arch/powerpc/lib/crc-vpmsum-template.S
new file mode 100644
index 000000000000..b0f87f595b26
--- /dev/null
+++ b/arch/powerpc/lib/crc-vpmsum-template.S
@@ -0,0 +1,746 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Core of the accelerated CRC algorithm.
+ * In your file, define the constants and CRC_FUNCTION_NAME
+ * Then include this file.
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * https://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+*/
+
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+#define MAX_SIZE 32768
+
+ .text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16 r25
+#define off32 r26
+#define off48 r27
+#define off64 r28
+#define off80 r29
+#define off96 r30
+#define off112 r31
+
+#define const1 v24
+#define const2 v25
+
+#define byteswap v26
+#define mask_32bit v27
+#define mask_64bit v28
+#define zeroes v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC_FUNCTION_NAME)
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+ std r26,-48(r1)
+ std r25,-56(r1)
+
+ li off16,16
+ li off32,32
+ li off48,48
+ li off64,64
+ li off80,80
+ li off96,96
+ li off112,112
+ li r0,0
+
+ /* Enough room for saving 10 non volatile VMX registers */
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ stvx v20,0,r6
+ stvx v21,off16,r6
+ stvx v22,off32,r6
+ stvx v23,off48,r6
+ stvx v24,off64,r6
+ stvx v25,off80,r6
+ stvx v26,off96,r6
+ stvx v27,off112,r6
+ stvx v28,0,r7
+ stvx v29,off16,r7
+
+ mr r10,r3
+
+ vxor zeroes,zeroes,zeroes
+ vspltisw v0,-1
+
+ vsldoi mask_32bit,zeroes,v0,4
+ vsldoi mask_64bit,zeroes,v0,8
+
+ /* Get the initial value into v8 */
+ vxor v8,v8,v8
+ MTVRD(v8, R3)
+#ifdef REFLECT
+ vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
+#else
+ vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+ LOAD_REG_ADDR(r3, .byteswap_constant)
+ lvx byteswap,0,r3
+ addi r3,r3,16
+#endif
+
+ cmpdi r5,256
+ blt .Lshort
+
+ rldicr r6,r5,0,56
+
+ /* Checksum in blocks of MAX_SIZE */
+1: lis r7,MAX_SIZE@h
+ ori r7,r7,MAX_SIZE@l
+ mr r9,r7
+ cmpd r6,r7
+ bgt 2f
+ mr r7,r6
+2: subf r6,r7,r6
+
+ /* our main loop does 128 bytes at a time */
+ srdi r7,r7,7
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ sldi r8,r7,4
+ srdi r9,r9,3
+ subf r8,r8,r9
+
+ /* We reduce our final 128 bytes in a separate step */
+ addi r7,r7,-1
+ mtctr r7
+
+ LOAD_REG_ADDR(r3, .constants)
+
+ /* Find the start of our constants */
+ add r3,r3,r8
+
+ /* zero v0-v7 which will contain our checksums */
+ vxor v0,v0,v0
+ vxor v1,v1,v1
+ vxor v2,v2,v2
+ vxor v3,v3,v3
+ vxor v4,v4,v4
+ vxor v5,v5,v5
+ vxor v6,v6,v6
+ vxor v7,v7,v7
+
+ lvx const1,0,r3
+
+ /*
+ * If we are looping back to consume more data we use the values
+ * already in v16-v23.
+ */
+ cmpdi r0,1
+ beq 2f
+
+ /* First warm up pass */
+ lvx v16,0,r4
+ lvx v17,off16,r4
+ VPERM(v16,v16,v16,byteswap)
+ VPERM(v17,v17,v17,byteswap)
+ lvx v18,off32,r4
+ lvx v19,off48,r4
+ VPERM(v18,v18,v18,byteswap)
+ VPERM(v19,v19,v19,byteswap)
+ lvx v20,off64,r4
+ lvx v21,off80,r4
+ VPERM(v20,v20,v20,byteswap)
+ VPERM(v21,v21,v21,byteswap)
+ lvx v22,off96,r4
+ lvx v23,off112,r4
+ VPERM(v22,v22,v22,byteswap)
+ VPERM(v23,v23,v23,byteswap)
+ addi r4,r4,8*16
+
+ /* xor in initial value */
+ vxor v16,v16,v8
+
+2: bdz .Lfirst_warm_up_done
+
+ addi r3,r3,16
+ lvx const2,0,r3
+
+ /* Second warm up pass */
+ VPMSUMD(v8,v16,const1)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v9,v17,const1)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v10,v18,const1)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v11,v19,const1)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdz .Lfirst_cool_down
+
+ /*
+ * main loop. We modulo schedule it such that it takes three iterations
+ * to complete - first iteration load, second iteration vpmsum, third
+ * iteration xor.
+ */
+ .balign 16
+4: lvx const1,0,r3
+ addi r3,r3,16
+ ori r2,r2,0
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const2)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const2)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const2)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const2)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ lvx const2,0,r3
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdnz 4b
+
+.Lfirst_cool_down:
+ /* First cool down pass */
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const1)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const1)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const1)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const1)
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ ori r2,r2,0
+
+.Lsecond_cool_down:
+ /* Second cool down pass */
+ vxor v0,v0,v8
+ vxor v1,v1,v9
+ vxor v2,v2,v10
+ vxor v3,v3,v11
+ vxor v4,v4,v12
+ vxor v5,v5,v13
+ vxor v6,v6,v14
+ vxor v7,v7,v15
+
+#ifdef REFLECT
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ vsldoi v0,v0,zeroes,4
+ vsldoi v1,v1,zeroes,4
+ vsldoi v2,v2,zeroes,4
+ vsldoi v3,v3,zeroes,4
+ vsldoi v4,v4,zeroes,4
+ vsldoi v5,v5,zeroes,4
+ vsldoi v6,v6,zeroes,4
+ vsldoi v7,v7,zeroes,4
+#endif
+
+ /* xor with last 1024 bits */
+ lvx v8,0,r4
+ lvx v9,off16,r4
+ VPERM(v8,v8,v8,byteswap)
+ VPERM(v9,v9,v9,byteswap)
+ lvx v10,off32,r4
+ lvx v11,off48,r4
+ VPERM(v10,v10,v10,byteswap)
+ VPERM(v11,v11,v11,byteswap)
+ lvx v12,off64,r4
+ lvx v13,off80,r4
+ VPERM(v12,v12,v12,byteswap)
+ VPERM(v13,v13,v13,byteswap)
+ lvx v14,off96,r4
+ lvx v15,off112,r4
+ VPERM(v14,v14,v14,byteswap)
+ VPERM(v15,v15,v15,byteswap)
+
+ addi r4,r4,8*16
+
+ vxor v16,v0,v8
+ vxor v17,v1,v9
+ vxor v18,v2,v10
+ vxor v19,v3,v11
+ vxor v20,v4,v12
+ vxor v21,v5,v13
+ vxor v22,v6,v14
+ vxor v23,v7,v15
+
+ li r0,1
+ cmpdi r6,0
+ addi r6,r6,128
+ bne 1b
+
+ /* Work out how many bytes we have left */
+ andi. r5,r5,127
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,128
+ add r3,r3,r6
+
+ /* How many 16 byte chunks are in the tail */
+ srdi r7,r5,4
+ mtctr r7
+
+ /*
+ * Reduce the previously calculated 1024 bits to 64 bits, shifting
+ * 32 bits to include the trailing 32 bits of zeros
+ */
+ lvx v0,0,r3
+ lvx v1,off16,r3
+ lvx v2,off32,r3
+ lvx v3,off48,r3
+ lvx v4,off64,r3
+ lvx v5,off80,r3
+ lvx v6,off96,r3
+ lvx v7,off112,r3
+ addi r3,r3,8*16
+
+ VPMSUMW(v0,v16,v0)
+ VPMSUMW(v1,v17,v1)
+ VPMSUMW(v2,v18,v2)
+ VPMSUMW(v3,v19,v3)
+ VPMSUMW(v4,v20,v4)
+ VPMSUMW(v5,v21,v5)
+ VPMSUMW(v6,v22,v6)
+ VPMSUMW(v7,v23,v7)
+
+ /* Now reduce the tail (0 - 112 bytes) */
+ cmpdi r7,0
+ beq 1f
+
+ lvx v16,0,r4
+ lvx v17,0,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off16,r4
+ lvx v17,off16,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off32,r4
+ lvx v17,off32,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off48,r4
+ lvx v17,off48,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off64,r4
+ lvx v17,off64,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off80,r4
+ lvx v17,off80,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off96,r4
+ lvx v17,off96,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+
+ /* Now xor all the parallel chunks together */
+1: vxor v0,v0,v1
+ vxor v2,v2,v3
+ vxor v4,v4,v5
+ vxor v6,v6,v7
+
+ vxor v0,v0,v2
+ vxor v4,v4,v6
+
+ vxor v0,v0,v4
+
+.Lbarrett_reduction:
+ /* Barrett constants */
+ LOAD_REG_ADDR(r3, .barrett_constants)
+
+ lvx const1,0,r3
+ lvx const2,off16,r3
+
+ vsldoi v1,v0,v0,8
+ vxor v0,v0,v1 /* xor two 64 bit results together */
+
+#ifdef REFLECT
+ /* shift left one bit */
+ vspltisb v1,1
+ vsl v0,v0,v1
+#endif
+
+ vand v0,v0,mask_64bit
+#ifndef REFLECT
+ /*
+ * Now for the Barrett reduction algorithm. The idea is to calculate q,
+ * the multiple of our polynomial that we need to subtract. By
+ * doing the computation 2x bits higher (ie 64 bits) and shifting the
+ * result back down 2x bits, we round down to the nearest multiple.
+ */
+ VPMSUMD(v1,v0,const1) /* ma */
+ vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Get the result into r3. We need to shift it left 8 bytes:
+ * V0 [ 0 1 2 X ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
+#else
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+ vand v1,v0,mask_32bit /* bottom 32 bits of a */
+ VPMSUMD(v1,v1,const1) /* ma */
+ vand v1,v1,mask_32bit /* bottom 32bits of ma */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
+#endif
+
+ /* Get it into r3 */
+ MFVRD(R3, v0)
+
+.Lout:
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ lvx v20,0,r6
+ lvx v21,off16,r6
+ lvx v22,off32,r6
+ lvx v23,off48,r6
+ lvx v24,off64,r6
+ lvx v25,off80,r6
+ lvx v26,off96,r6
+ lvx v27,off112,r6
+ lvx v28,0,r7
+ lvx v29,off16,r7
+
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ ld r26,-48(r1)
+ ld r25,-56(r1)
+
+ blr
+
+.Lfirst_warm_up_done:
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ VPMSUMD(v8,v16,const1)
+ VPMSUMD(v9,v17,const1)
+ VPMSUMD(v10,v18,const1)
+ VPMSUMD(v11,v19,const1)
+ VPMSUMD(v12,v20,const1)
+ VPMSUMD(v13,v21,const1)
+ VPMSUMD(v14,v22,const1)
+ VPMSUMD(v15,v23,const1)
+
+ b .Lsecond_cool_down
+
+.Lshort:
+ cmpdi r5,0
+ beq .Lzero
+
+ LOAD_REG_ADDR(r3, .short_constants)
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,256
+ add r3,r3,r6
+
+ /* How many 16 byte chunks? */
+ srdi r7,r5,4
+ mtctr r7
+
+ vxor v19,v19,v19
+ vxor v20,v20,v20
+
+ lvx v0,0,r4
+ lvx v16,0,r3
+ VPERM(v0,v0,v16,byteswap)
+ vxor v0,v0,v8 /* xor in initial value */
+ VPMSUMW(v0,v0,v16)
+ bdz .Lv0
+
+ lvx v1,off16,r4
+ lvx v17,off16,r3
+ VPERM(v1,v1,v17,byteswap)
+ VPMSUMW(v1,v1,v17)
+ bdz .Lv1
+
+ lvx v2,off32,r4
+ lvx v16,off32,r3
+ VPERM(v2,v2,v16,byteswap)
+ VPMSUMW(v2,v2,v16)
+ bdz .Lv2
+
+ lvx v3,off48,r4
+ lvx v17,off48,r3
+ VPERM(v3,v3,v17,byteswap)
+ VPMSUMW(v3,v3,v17)
+ bdz .Lv3
+
+ lvx v4,off64,r4
+ lvx v16,off64,r3
+ VPERM(v4,v4,v16,byteswap)
+ VPMSUMW(v4,v4,v16)
+ bdz .Lv4
+
+ lvx v5,off80,r4
+ lvx v17,off80,r3
+ VPERM(v5,v5,v17,byteswap)
+ VPMSUMW(v5,v5,v17)
+ bdz .Lv5
+
+ lvx v6,off96,r4
+ lvx v16,off96,r3
+ VPERM(v6,v6,v16,byteswap)
+ VPMSUMW(v6,v6,v16)
+ bdz .Lv6
+
+ lvx v7,off112,r4
+ lvx v17,off112,r3
+ VPERM(v7,v7,v17,byteswap)
+ VPMSUMW(v7,v7,v17)
+ bdz .Lv7
+
+ addi r3,r3,128
+ addi r4,r4,128
+
+ lvx v8,0,r4
+ lvx v16,0,r3
+ VPERM(v8,v8,v16,byteswap)
+ VPMSUMW(v8,v8,v16)
+ bdz .Lv8
+
+ lvx v9,off16,r4
+ lvx v17,off16,r3
+ VPERM(v9,v9,v17,byteswap)
+ VPMSUMW(v9,v9,v17)
+ bdz .Lv9
+
+ lvx v10,off32,r4
+ lvx v16,off32,r3
+ VPERM(v10,v10,v16,byteswap)
+ VPMSUMW(v10,v10,v16)
+ bdz .Lv10
+
+ lvx v11,off48,r4
+ lvx v17,off48,r3
+ VPERM(v11,v11,v17,byteswap)
+ VPMSUMW(v11,v11,v17)
+ bdz .Lv11
+
+ lvx v12,off64,r4
+ lvx v16,off64,r3
+ VPERM(v12,v12,v16,byteswap)
+ VPMSUMW(v12,v12,v16)
+ bdz .Lv12
+
+ lvx v13,off80,r4
+ lvx v17,off80,r3
+ VPERM(v13,v13,v17,byteswap)
+ VPMSUMW(v13,v13,v17)
+ bdz .Lv13
+
+ lvx v14,off96,r4
+ lvx v16,off96,r3
+ VPERM(v14,v14,v16,byteswap)
+ VPMSUMW(v14,v14,v16)
+ bdz .Lv14
+
+ lvx v15,off112,r4
+ lvx v17,off112,r3
+ VPERM(v15,v15,v17,byteswap)
+ VPMSUMW(v15,v15,v17)
+
+.Lv15: vxor v19,v19,v15
+.Lv14: vxor v20,v20,v14
+.Lv13: vxor v19,v19,v13
+.Lv12: vxor v20,v20,v12
+.Lv11: vxor v19,v19,v11
+.Lv10: vxor v20,v20,v10
+.Lv9: vxor v19,v19,v9
+.Lv8: vxor v20,v20,v8
+.Lv7: vxor v19,v19,v7
+.Lv6: vxor v20,v20,v6
+.Lv5: vxor v19,v19,v5
+.Lv4: vxor v20,v20,v4
+.Lv3: vxor v19,v19,v3
+.Lv2: vxor v20,v20,v2
+.Lv1: vxor v19,v19,v1
+.Lv0: vxor v20,v20,v0
+
+ vxor v0,v19,v20
+
+ b .Lbarrett_reduction
+
+.Lzero:
+ mr r3,r10
+ b .Lout
+
+FUNC_END(CRC_FUNCTION_NAME)
diff --git a/arch/powerpc/lib/crc32.c b/arch/powerpc/lib/crc32.c
new file mode 100644
index 000000000000..0d9befb6e7b8
--- /dev/null
+++ b/arch/powerpc/lib/crc32.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <asm/switch_to.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 512
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+
+u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len);
+
+u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ return crc32_le_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_le_arch);
+
+u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
+ !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
+ return crc32c_base(crc, p, len);
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = crc32c_base(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ if (len & ~VMX_ALIGN_MASK) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ }
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = crc32c_base(crc, p, tail);
+ }
+
+ return crc;
+}
+EXPORT_SYMBOL(crc32c_arch);
+
+u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+ return crc32_be_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be_arch);
+
+static int __init crc32_powerpc_init(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
+ static_branch_enable(&have_vec_crypto);
+ return 0;
+}
+subsys_initcall(crc32_powerpc_init);
+
+static void __exit crc32_powerpc_exit(void)
+{
+}
+module_exit(crc32_powerpc_exit);
+
+u32 crc32_optimizations(void)
+{
+ if (static_key_enabled(&have_vec_crypto))
+ return CRC32C_OPTIMIZATION;
+ return 0;
+}
+EXPORT_SYMBOL(crc32_optimizations);
+
+MODULE_AUTHOR("Anton Blanchard <anton@samba.org>");
+MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/lib/crc32c-vpmsum_asm.S b/arch/powerpc/lib/crc32c-vpmsum_asm.S
new file mode 100644
index 000000000000..1b35c55cce0a
--- /dev/null
+++ b/arch/powerpc/lib/crc32c-vpmsum_asm.S
@@ -0,0 +1,842 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Calculate a crc32c with vpmsum acceleration
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+ /* Reduce 262144 kbits to 1024 bits */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ .octa 0x00000000b6ca9e20000000009c37c408
+
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ .octa 0x00000000350249a800000001b51df26c
+
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ .octa 0x00000001862dac54000000000724b9d0
+
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ .octa 0x00000001d87fb48c00000001c00532fe
+
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ .octa 0x00000001f39b699e00000000f05a9362
+
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ .octa 0x0000000101da11b400000001e1007970
+
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ .octa 0x00000001cab571e000000000a57366ee
+
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ .octa 0x00000000c7020cfe0000000192011284
+
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ .octa 0x00000000cdaed1ae0000000162716d9a
+
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ .octa 0x00000001e804effc00000000cd97ecde
+
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ .octa 0x0000000077c3ea3a0000000058812bc0
+
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ .octa 0x0000000068df31b40000000088b8c12e
+
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ .octa 0x00000000b059b6c200000001230b234c
+
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ .octa 0x0000000145fb8ed800000001120b416e
+
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ .octa 0x00000000cbc0916800000001974aecb0
+
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ .octa 0x000000005ceeedc2000000008ee3f226
+
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ .octa 0x0000000047d74e8600000001089aba9a
+
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ .octa 0x00000001407e9e220000000065113872
+
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ .octa 0x00000001da967bda000000005c07ec10
+
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ .octa 0x000000006c8983680000000187590924
+
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ .octa 0x00000000f2d14c9800000000e35da7c6
+
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ .octa 0x00000001993c6ad4000000000415855a
+
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ .octa 0x000000014683d1ac0000000073617758
+
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ .octa 0x00000001a7c93e6c0000000176021d28
+
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ .octa 0x000000010211e90a00000001c358fd0a
+
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ .octa 0x000000001119403e00000001ff7a2c18
+
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ .octa 0x000000001c3261aa00000000f2d9f7e4
+
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ .octa 0x000000014e37a634000000016cf1f9c8
+
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ .octa 0x0000000073786c0c000000010af9279a
+
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ .octa 0x000000011dc037f80000000004f101e8
+
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ .octa 0x0000000031433dfc0000000070bcf184
+
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ .octa 0x000000009cde8348000000000a8de642
+
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ .octa 0x0000000038d3c2a60000000062ea130c
+
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ .octa 0x000000011b25f26000000001eb31cbb2
+
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ .octa 0x000000001629e6f00000000170783448
+
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ .octa 0x0000000160838b4c00000001a684b4c6
+
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ .octa 0x000000007a44011c00000000253ca5b4
+
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ .octa 0x00000000226f417a0000000057b4b1e2
+
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ .octa 0x0000000045eb2eb400000000b6bd084c
+
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ .octa 0x000000014459d70c0000000123c2d592
+
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ .octa 0x00000001d406ed8200000000159dafce
+
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ .octa 0x0000000160c8e1a80000000127e1a64e
+
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ .octa 0x0000000027ba80980000000056860754
+
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ .octa 0x000000006d92d01800000001e661aae8
+
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ .octa 0x000000012ed7e3f200000000f82c6166
+
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ .octa 0x000000002dc8778800000000c4f9c7ae
+
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ .octa 0x0000000018240bb80000000074203d20
+
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ .octa 0x000000001ad381580000000198173052
+
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ .octa 0x00000001396b78f200000001ce8aba54
+
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ .octa 0x000000011a68133400000001850d5d94
+
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ .octa 0x000000012104732e00000001d609239c
+
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ .octa 0x00000000a140d90c000000001595f048
+
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ .octa 0x00000001b7215eda0000000042ccee08
+
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ .octa 0x00000001aaf1df3c000000010a389d74
+
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ .octa 0x0000000029d15b8a000000012a840da6
+
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ .octa 0x00000000f1a96922000000001d181c0c
+
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ .octa 0x00000001ac80d03c0000000068b7d1f6
+
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ .octa 0x000000000f11d56a000000005b0f14fc
+
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ .octa 0x00000001f1c022a20000000179e9e730
+
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ .octa 0x0000000173d00ae200000001ce1368d6
+
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ .octa 0x00000001d4ffe4ac0000000112c3a84c
+
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ .octa 0x000000016edc5ae400000000de940fee
+
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ .octa 0x00000001f1a0214000000000fe896b7e
+
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ .octa 0x00000000ca0b28a000000001f797431c
+
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ .octa 0x00000001928e30a20000000053e989ba
+
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ .octa 0x0000000097b1b002000000003920cd16
+
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ .octa 0x00000000b15bf90600000001e6f579b8
+
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ .octa 0x00000000411c5d52000000007493cb0a
+
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ .octa 0x00000001c36f330000000001bdd376d8
+
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ .octa 0x00000001119227e0000000016badfee6
+
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ .octa 0x00000000114d47020000000071de5c58
+
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ .octa 0x00000000458b5b9800000000453f317c
+
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ .octa 0x000000012e31fb8e0000000121675cce
+
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ .octa 0x000000005cf619d800000001f409ee92
+
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ .octa 0x0000000063f4d8b200000000f36b9c88
+
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ .octa 0x000000004138dc8a0000000036b398f4
+
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ .octa 0x00000001d29ee8e000000001748f9adc
+
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ .octa 0x000000006a08ace800000001be94ec00
+
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ .octa 0x0000000127d4201000000000b74370d6
+
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ .octa 0x0000000019d76b6200000001174d0b98
+
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ .octa 0x00000001b1471f6e00000000befc06a4
+
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ .octa 0x00000001f64c19cc00000001ae125288
+
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ .octa 0x00000000003c0ea00000000095c19b34
+
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ .octa 0x000000014d73abf600000001a78496f2
+
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ .octa 0x00000001620eb84400000001ac5390a0
+
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ .octa 0x0000000147655048000000002a80ed6e
+
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ .octa 0x0000000067b5077e00000001fa9b0128
+
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ .octa 0x0000000010ffe20600000001ea94929e
+
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ .octa 0x000000000fee8f1e0000000125f4305c
+
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ .octa 0x00000001da26fbae00000001471e2002
+
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ .octa 0x00000001b3a8bd880000000132d2253a
+
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ .octa 0x00000000e8f3898e00000000f26b3592
+
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ .octa 0x00000000b0d0d28c00000000bc8b67b0
+
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ .octa 0x0000000030f2a798000000013a826ef2
+
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ .octa 0x000000000fba10020000000081482c84
+
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ .octa 0x00000000bdb9bd7200000000e77307c2
+
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ .octa 0x0000000075d3bf5a00000000d4a07ec8
+
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ .octa 0x00000000ef1f98a00000000017102100
+
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ .octa 0x00000000689c760200000000db406486
+
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ .octa 0x000000016d5fa5fe0000000192db7f88
+
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ .octa 0x00000001d0d2b9ca000000018bf67b1e
+
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ .octa 0x0000000041e7b470000000007c09163e
+
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ .octa 0x00000001cbb6495e000000000adac060
+
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ .octa 0x000000010052a0b000000000bd8316ae
+
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ .octa 0x00000001d8effb5c000000019f09ab54
+
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ .octa 0x00000001d969853c0000000125155542
+
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ .octa 0x00000000523ccce2000000018fdb5882
+
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ .octa 0x000000001e2436bc00000000e794b3f4
+
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ .octa 0x00000000ddd1c3a2000000016f9bb022
+
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ .octa 0x0000000019fcfe3800000000290c9978
+
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ .octa 0x00000001ce95db640000000083c0f350
+
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ .octa 0x00000000af5828060000000173ea6628
+
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ .octa 0x00000001006388f600000001c8b4e00a
+
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ .octa 0x0000000179eca00a00000000de95d6aa
+
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ .octa 0x0000000122410a6a000000010b7f7248
+
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ .octa 0x000000004288e87c00000001326e3a06
+
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ .octa 0x000000016c5490da00000000bb62c2e6
+
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ .octa 0x00000000d1c71f6e0000000156a4b2c2
+
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ .octa 0x00000001b4ce08a6000000011dfe763a
+
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ .octa 0x00000001466ba60c000000007bcca8e2
+
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ .octa 0x00000001f6c488a40000000186118faa
+
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ .octa 0x000000013bfb06820000000111a65a88
+
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ .octa 0x00000000690e9e54000000003565e1c4
+
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ .octa 0x00000000281346b6000000012ed02a82
+
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ .octa 0x000000015646402400000000c486ecfc
+
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ .octa 0x000000016063a8dc0000000001b951b2
+
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ .octa 0x0000000116a663620000000048143916
+
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ .octa 0x000000017e8aa4d200000001dc2ae124
+
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ .octa 0x00000001728eb10c00000001416c58d6
+
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ .octa 0x00000001b08fd7fa00000000a479744a
+
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ .octa 0x00000001092a16e80000000096ca3a26
+
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ .octa 0x00000000a505637c00000000ff223d4e
+
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ .octa 0x00000000d94869b2000000010e84da42
+
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ .octa 0x00000001c8b203ae00000001b61ba3d0
+
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ .octa 0x000000005704aea000000000680f2de8
+
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ .octa 0x000000012e295fa2000000008772a9a8
+
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ .octa 0x000000011d0908bc0000000155f295bc
+
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ .octa 0x0000000193ed97ea00000000595f9282
+
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ .octa 0x000000013a0f1c520000000164b1c25a
+
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ .octa 0x000000010c2c40c000000000fbd67c50
+
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ .octa 0x00000000ff6fac3e0000000096076268
+
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ .octa 0x000000017b3609c000000001d288e4cc
+
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ .octa 0x0000000088c8c92200000001eaac1bdc
+
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ .octa 0x00000001751baae600000001f1ea39e2
+
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ .octa 0x000000010795297200000001eb6506fc
+
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ .octa 0x0000000162b00abe000000010f806ffe
+
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ .octa 0x000000000d7b404c000000010408481e
+
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ .octa 0x00000000763b13d40000000188260534
+
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ .octa 0x00000000f6dc22d80000000058fc73e0
+
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ .octa 0x000000007daae06000000000391c59b8
+
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ .octa 0x000000013359ab7c000000018b638400
+
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ .octa 0x000000008add438a000000011738f5c4
+
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ .octa 0x00000001edbefdea000000008cf7c6da
+
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ .octa 0x000000004104e0f800000001ef97fb16
+
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ .octa 0x00000000b48a82220000000102130e20
+
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ .octa 0x00000001bcb4684400000000db968898
+
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ .octa 0x000000013293ce0a00000000b5047b5e
+
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ .octa 0x00000001710d0844000000010b90fdb2
+
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ .octa 0x0000000117907f6e000000004834a32e
+
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ .octa 0x0000000087ddf93e0000000059c8f2b0
+
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ .octa 0x000000005970e9b00000000122cec508
+
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ .octa 0x0000000185b2b7d0000000000a330cda
+
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ .octa 0x00000001dcee0efc000000014a47148c
+
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ .octa 0x0000000030da27220000000042c61cb8
+
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ .octa 0x000000012f925a180000000012fe6960
+
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ .octa 0x00000000dd2e357c00000000dbda2c20
+
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ .octa 0x00000000071c80de000000011122410c
+
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ .octa 0x000000011513140a00000000977b2070
+
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ .octa 0x00000001df876e8e000000014050438e
+
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ .octa 0x000000015f81d6ce0000000147c840e8
+
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ .octa 0x000000019dd94dbe00000001cc7c88ce
+
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ .octa 0x00000001373d206e00000001476b35a4
+
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ .octa 0x00000000668ccade000000013d52d508
+
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ .octa 0x00000001b192d268000000008e4be32e
+
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ .octa 0x00000000e30f3a7800000000024120fe
+
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ .octa 0x000000010ef1f7bc00000000ddecddb4
+
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ .octa 0x00000001f5ac738000000000d4d403bc
+
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ .octa 0x000000011822ea7000000001734b89aa
+
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ .octa 0x00000000c3a33848000000010e7a58d6
+
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ .octa 0x00000001bd151c2400000001f9f04e9c
+
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ .octa 0x0000000056002d7600000000b692225e
+
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ .octa 0x000000014657c4f4000000019b8d3f3e
+
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ .octa 0x0000000113742d7c00000001a874f11e
+
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ .octa 0x000000019c5920ba000000010d5a4254
+
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ .octa 0x000000005216d2d600000000bbb2f5d6
+
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ .octa 0x0000000136f5ad8a0000000179cc0e36
+
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ .octa 0x000000018b07beb600000001dca1da4a
+
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ .octa 0x00000000db1e93b000000000feb1a192
+
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ .octa 0x000000000b96fa3a00000000d1eeedd6
+
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ .octa 0x00000001d9968af0000000008fad9bb4
+
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ .octa 0x000000000e4a77a200000001884938e4
+
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ .octa 0x00000000508c2ac800000001bc2e9bc0
+
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ .octa 0x0000000021572a8000000001f9658a68
+
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ .octa 0x00000001b859daf2000000001b9224fc
+
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ .octa 0x000000016f7884740000000055b2fb84
+
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ .octa 0x00000001b438810e000000018b090348
+
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ .octa 0x0000000095ddc6f2000000011ccbd5ea
+
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ .octa 0x00000001d977c20c0000000007ae47f8
+
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ .octa 0x00000000ebedb99a0000000172acbec0
+
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ .octa 0x00000001df9e9e9200000001c6e3ff20
+
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ .octa 0x00000001a4a3f95200000000e1b38744
+
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ .octa 0x00000000e2f5122000000000791585b2
+
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ .octa 0x000000004aa01f3e00000000ac53b894
+
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ .octa 0x00000000b3e90a5800000001ed5f2cf4
+
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ .octa 0x000000000c9ca2aa00000001df48b2e0
+
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ .octa 0x000000015168231600000000049c1c62
+
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ .octa 0x0000000036fce78c000000017c460c12
+
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ .octa 0x000000009037dc10000000015be4da7e
+
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ .octa 0x00000000d3298582000000010f38f668
+
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ .octa 0x00000001b42e8ad60000000039f40a00
+
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ .octa 0x00000000142a983800000000bd4c10c4
+
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ .octa 0x0000000109c7f1900000000042db1d98
+
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ .octa 0x0000000056ff931000000001c905bae6
+
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ .octa 0x00000001594513aa00000000069d40ea
+
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ .octa 0x00000001e3b5b1e8000000008e4fbad0
+
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ .octa 0x000000011dd5fc080000000047bedd46
+
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ .octa 0x00000001675f0cc20000000026396bf8
+
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ .octa 0x00000000d1c8dd4400000000379beb92
+
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ .octa 0x0000000115ebd3d8000000000abae54a
+
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ .octa 0x00000001ecbd0dac0000000007e6a128
+
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ .octa 0x00000000cdf67af2000000000ade29d2
+
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ .octa 0x000000004c01ff4c00000000f974c45c
+
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ .octa 0x00000000f2d8657e00000000e77ac60a
+
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ .octa 0x000000006bae74c40000000145895816
+
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ .octa 0x0000000152af8aa00000000038e362be
+
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ .octa 0x0000000004663802000000007f991a64
+
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ .octa 0x00000001ab2f5afc00000000fa366d3a
+
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ .octa 0x0000000074a4ebd400000001a2bb34f0
+
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ .octa 0x00000001d7ab3a4c0000000028a9981e
+
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ .octa 0x00000001a8da60c600000001dbc672be
+
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ .octa 0x000000013cf6382000000000b04d77f6
+
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ .octa 0x00000000bec12e1e0000000124400d96
+
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ .octa 0x00000001c6368010000000014ca4b414
+
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ .octa 0x00000001e6e78758000000012fe2c938
+
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ .octa 0x000000008d7f2b3c00000001faed01e6
+
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ .octa 0x000000016b4a156e000000007e80ecfe
+
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ .octa 0x00000001c63cfeb60000000098daee94
+
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ .octa 0x000000015f902670000000010a04edea
+
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ .octa 0x00000001cd5de11e00000001c00b4524
+
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ .octa 0x000000001acaec540000000170296550
+
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ .octa 0x000000002bd0ca780000000181afaa48
+
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ .octa 0x0000000032d63d5c0000000185a31ffa
+
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ .octa 0x000000001c6d4e4c000000002469f608
+
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ .octa 0x0000000106a60b92000000006980102a
+
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ .octa 0x00000000d3855e120000000111ea9ca8
+
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ .octa 0x00000000e312563600000001bd1d29ce
+
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ .octa 0x000000009e8f7ea400000001b34b9580
+
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ .octa 0x00000001c82e562c000000003076054e
+
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ .octa 0x00000000ca9f09ce000000012a608ea4
+
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ .octa 0x00000000c63764e600000000784d05fe
+
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ .octa 0x0000000168d2e49e000000016ef0d82a
+
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ .octa 0x00000000e986c1480000000075bda454
+
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ .octa 0x00000000cfb65894000000003dc0a1c4
+
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ .octa 0x0000000111cadee400000000e9a5d8be
+
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ .octa 0x0000000171fb63ce00000001609bc4b4
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
+ .octa 0x7fec2963e5bf80485cf015c388e56f72
+
+ /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
+ .octa 0x38e888d4844752a9963a18920246e2e6
+
+ /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
+ .octa 0x42316c00730206ad419a441956993a31
+
+ /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
+ .octa 0x543d5c543e65ddf9924752ba2b830011
+
+ /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
+ .octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+ /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
+ .octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+ /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
+ .octa 0x3f4840246791d588c133722b1fe0b5c3
+
+ /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
+ .octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+ /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
+ .octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+ /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+ .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+ /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+ .octa 0x041d37768cd75659817cdc5119b29a35
+
+ /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+ .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+ /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+ .octa 0x0e148e8252377a554f256efcb82be955
+
+ /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+ .octa 0x9c25531d19e65ddeec1631edb2dea967
+
+ /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+ .octa 0x790606ff9957c0a65d27e147510ac59a
+
+ /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+ .octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+
+.barrett_constants:
+ /* 33 bit reflected Barrett constant m - (4^32)/n */
+ .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
+ /* 33 bit reflected Barrett constant n */
+ .octa 0x00000000000000000000000105ec76f1
+
+#define CRC_FUNCTION_NAME __crc32c_vpmsum
+#define REFLECT
+#include "crc-vpmsum-template.S"
diff --git a/arch/powerpc/lib/crct10dif-vpmsum_asm.S b/arch/powerpc/lib/crct10dif-vpmsum_asm.S
new file mode 100644
index 000000000000..47a6266d89a8
--- /dev/null
+++ b/arch/powerpc/lib/crct10dif-vpmsum_asm.S
@@ -0,0 +1,845 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Calculate a CRC T10DIF with vpmsum acceleration
+ *
+ * Constants generated by crc32-vpmsum, available at
+ * https://github.com/antonblanchard/crc32-vpmsum
+ *
+ * crc32-vpmsum is
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+ /* Reduce 262144 kbits to 1024 bits */
+ /* x^261184 mod p(x), x^261120 mod p(x) */
+ .octa 0x0000000056d300000000000052550000
+
+ /* x^260160 mod p(x), x^260096 mod p(x) */
+ .octa 0x00000000ee67000000000000a1e40000
+
+ /* x^259136 mod p(x), x^259072 mod p(x) */
+ .octa 0x0000000060830000000000004ad10000
+
+ /* x^258112 mod p(x), x^258048 mod p(x) */
+ .octa 0x000000008cfe0000000000009ab40000
+
+ /* x^257088 mod p(x), x^257024 mod p(x) */
+ .octa 0x000000003e93000000000000fdb50000
+
+ /* x^256064 mod p(x), x^256000 mod p(x) */
+ .octa 0x000000003c2000000000000045480000
+
+ /* x^255040 mod p(x), x^254976 mod p(x) */
+ .octa 0x00000000b1fc0000000000008d690000
+
+ /* x^254016 mod p(x), x^253952 mod p(x) */
+ .octa 0x00000000f82b00000000000024ad0000
+
+ /* x^252992 mod p(x), x^252928 mod p(x) */
+ .octa 0x0000000044420000000000009f1a0000
+
+ /* x^251968 mod p(x), x^251904 mod p(x) */
+ .octa 0x00000000e88c00000000000066ec0000
+
+ /* x^250944 mod p(x), x^250880 mod p(x) */
+ .octa 0x00000000385c000000000000c87d0000
+
+ /* x^249920 mod p(x), x^249856 mod p(x) */
+ .octa 0x000000003227000000000000c8ff0000
+
+ /* x^248896 mod p(x), x^248832 mod p(x) */
+ .octa 0x00000000a9a900000000000033440000
+
+ /* x^247872 mod p(x), x^247808 mod p(x) */
+ .octa 0x00000000abaa00000000000066eb0000
+
+ /* x^246848 mod p(x), x^246784 mod p(x) */
+ .octa 0x000000001ac3000000000000c4ef0000
+
+ /* x^245824 mod p(x), x^245760 mod p(x) */
+ .octa 0x0000000063f000000000000056f30000
+
+ /* x^244800 mod p(x), x^244736 mod p(x) */
+ .octa 0x0000000032cc00000000000002050000
+
+ /* x^243776 mod p(x), x^243712 mod p(x) */
+ .octa 0x00000000f8b5000000000000568e0000
+
+ /* x^242752 mod p(x), x^242688 mod p(x) */
+ .octa 0x000000008db100000000000064290000
+
+ /* x^241728 mod p(x), x^241664 mod p(x) */
+ .octa 0x0000000059ca0000000000006b660000
+
+ /* x^240704 mod p(x), x^240640 mod p(x) */
+ .octa 0x000000005f5c00000000000018f80000
+
+ /* x^239680 mod p(x), x^239616 mod p(x) */
+ .octa 0x0000000061af000000000000b6090000
+
+ /* x^238656 mod p(x), x^238592 mod p(x) */
+ .octa 0x00000000e29e000000000000099a0000
+
+ /* x^237632 mod p(x), x^237568 mod p(x) */
+ .octa 0x000000000975000000000000a8360000
+
+ /* x^236608 mod p(x), x^236544 mod p(x) */
+ .octa 0x0000000043900000000000004f570000
+
+ /* x^235584 mod p(x), x^235520 mod p(x) */
+ .octa 0x00000000f9cd000000000000134c0000
+
+ /* x^234560 mod p(x), x^234496 mod p(x) */
+ .octa 0x000000007c29000000000000ec380000
+
+ /* x^233536 mod p(x), x^233472 mod p(x) */
+ .octa 0x000000004c6a000000000000b0d10000
+
+ /* x^232512 mod p(x), x^232448 mod p(x) */
+ .octa 0x00000000e7290000000000007d3e0000
+
+ /* x^231488 mod p(x), x^231424 mod p(x) */
+ .octa 0x00000000f1ab000000000000f0b20000
+
+ /* x^230464 mod p(x), x^230400 mod p(x) */
+ .octa 0x0000000039db0000000000009c270000
+
+ /* x^229440 mod p(x), x^229376 mod p(x) */
+ .octa 0x000000005e2800000000000092890000
+
+ /* x^228416 mod p(x), x^228352 mod p(x) */
+ .octa 0x00000000d44e000000000000d5ee0000
+
+ /* x^227392 mod p(x), x^227328 mod p(x) */
+ .octa 0x00000000cd0a00000000000041f50000
+
+ /* x^226368 mod p(x), x^226304 mod p(x) */
+ .octa 0x00000000c5b400000000000010520000
+
+ /* x^225344 mod p(x), x^225280 mod p(x) */
+ .octa 0x00000000fd2100000000000042170000
+
+ /* x^224320 mod p(x), x^224256 mod p(x) */
+ .octa 0x000000002f2500000000000095c20000
+
+ /* x^223296 mod p(x), x^223232 mod p(x) */
+ .octa 0x000000001b0100000000000001ce0000
+
+ /* x^222272 mod p(x), x^222208 mod p(x) */
+ .octa 0x000000000d430000000000002aca0000
+
+ /* x^221248 mod p(x), x^221184 mod p(x) */
+ .octa 0x0000000030a6000000000000385e0000
+
+ /* x^220224 mod p(x), x^220160 mod p(x) */
+ .octa 0x00000000e37b0000000000006f7a0000
+
+ /* x^219200 mod p(x), x^219136 mod p(x) */
+ .octa 0x00000000873600000000000024320000
+
+ /* x^218176 mod p(x), x^218112 mod p(x) */
+ .octa 0x00000000e9fb000000000000bd9c0000
+
+ /* x^217152 mod p(x), x^217088 mod p(x) */
+ .octa 0x000000003b9500000000000054bc0000
+
+ /* x^216128 mod p(x), x^216064 mod p(x) */
+ .octa 0x00000000133e000000000000a4660000
+
+ /* x^215104 mod p(x), x^215040 mod p(x) */
+ .octa 0x00000000784500000000000079930000
+
+ /* x^214080 mod p(x), x^214016 mod p(x) */
+ .octa 0x00000000b9800000000000001bb80000
+
+ /* x^213056 mod p(x), x^212992 mod p(x) */
+ .octa 0x00000000687600000000000024400000
+
+ /* x^212032 mod p(x), x^211968 mod p(x) */
+ .octa 0x00000000aff300000000000029e10000
+
+ /* x^211008 mod p(x), x^210944 mod p(x) */
+ .octa 0x0000000024b50000000000005ded0000
+
+ /* x^209984 mod p(x), x^209920 mod p(x) */
+ .octa 0x0000000017e8000000000000b12e0000
+
+ /* x^208960 mod p(x), x^208896 mod p(x) */
+ .octa 0x00000000128400000000000026d20000
+
+ /* x^207936 mod p(x), x^207872 mod p(x) */
+ .octa 0x000000002115000000000000a32a0000
+
+ /* x^206912 mod p(x), x^206848 mod p(x) */
+ .octa 0x000000009595000000000000a1210000
+
+ /* x^205888 mod p(x), x^205824 mod p(x) */
+ .octa 0x00000000281e000000000000ee8b0000
+
+ /* x^204864 mod p(x), x^204800 mod p(x) */
+ .octa 0x0000000006010000000000003d0d0000
+
+ /* x^203840 mod p(x), x^203776 mod p(x) */
+ .octa 0x00000000e2b600000000000034e90000
+
+ /* x^202816 mod p(x), x^202752 mod p(x) */
+ .octa 0x000000001bd40000000000004cdb0000
+
+ /* x^201792 mod p(x), x^201728 mod p(x) */
+ .octa 0x00000000df2800000000000030e90000
+
+ /* x^200768 mod p(x), x^200704 mod p(x) */
+ .octa 0x0000000049c200000000000042590000
+
+ /* x^199744 mod p(x), x^199680 mod p(x) */
+ .octa 0x000000009b97000000000000df950000
+
+ /* x^198720 mod p(x), x^198656 mod p(x) */
+ .octa 0x000000006184000000000000da7b0000
+
+ /* x^197696 mod p(x), x^197632 mod p(x) */
+ .octa 0x00000000461700000000000012510000
+
+ /* x^196672 mod p(x), x^196608 mod p(x) */
+ .octa 0x000000009b40000000000000f37e0000
+
+ /* x^195648 mod p(x), x^195584 mod p(x) */
+ .octa 0x00000000eeb2000000000000ecf10000
+
+ /* x^194624 mod p(x), x^194560 mod p(x) */
+ .octa 0x00000000b2e800000000000050f20000
+
+ /* x^193600 mod p(x), x^193536 mod p(x) */
+ .octa 0x00000000f59a000000000000e0b30000
+
+ /* x^192576 mod p(x), x^192512 mod p(x) */
+ .octa 0x00000000467f0000000000004d5a0000
+
+ /* x^191552 mod p(x), x^191488 mod p(x) */
+ .octa 0x00000000da92000000000000bb010000
+
+ /* x^190528 mod p(x), x^190464 mod p(x) */
+ .octa 0x000000001e1000000000000022a40000
+
+ /* x^189504 mod p(x), x^189440 mod p(x) */
+ .octa 0x0000000058fe000000000000836f0000
+
+ /* x^188480 mod p(x), x^188416 mod p(x) */
+ .octa 0x00000000b9ce000000000000d78d0000
+
+ /* x^187456 mod p(x), x^187392 mod p(x) */
+ .octa 0x0000000022210000000000004f8d0000
+
+ /* x^186432 mod p(x), x^186368 mod p(x) */
+ .octa 0x00000000744600000000000033760000
+
+ /* x^185408 mod p(x), x^185344 mod p(x) */
+ .octa 0x000000001c2e000000000000a1e50000
+
+ /* x^184384 mod p(x), x^184320 mod p(x) */
+ .octa 0x00000000dcc8000000000000a1a40000
+
+ /* x^183360 mod p(x), x^183296 mod p(x) */
+ .octa 0x00000000910f00000000000019a20000
+
+ /* x^182336 mod p(x), x^182272 mod p(x) */
+ .octa 0x0000000055d5000000000000f6ae0000
+
+ /* x^181312 mod p(x), x^181248 mod p(x) */
+ .octa 0x00000000c8ba000000000000a7ac0000
+
+ /* x^180288 mod p(x), x^180224 mod p(x) */
+ .octa 0x0000000031f8000000000000eea20000
+
+ /* x^179264 mod p(x), x^179200 mod p(x) */
+ .octa 0x000000001966000000000000c4d90000
+
+ /* x^178240 mod p(x), x^178176 mod p(x) */
+ .octa 0x00000000b9810000000000002b470000
+
+ /* x^177216 mod p(x), x^177152 mod p(x) */
+ .octa 0x000000008303000000000000f7cf0000
+
+ /* x^176192 mod p(x), x^176128 mod p(x) */
+ .octa 0x000000002ce500000000000035b30000
+
+ /* x^175168 mod p(x), x^175104 mod p(x) */
+ .octa 0x000000002fae0000000000000c7c0000
+
+ /* x^174144 mod p(x), x^174080 mod p(x) */
+ .octa 0x00000000f50c0000000000009edf0000
+
+ /* x^173120 mod p(x), x^173056 mod p(x) */
+ .octa 0x00000000714f00000000000004cd0000
+
+ /* x^172096 mod p(x), x^172032 mod p(x) */
+ .octa 0x00000000c161000000000000541b0000
+
+ /* x^171072 mod p(x), x^171008 mod p(x) */
+ .octa 0x0000000021c8000000000000e2700000
+
+ /* x^170048 mod p(x), x^169984 mod p(x) */
+ .octa 0x00000000b93d00000000000009a60000
+
+ /* x^169024 mod p(x), x^168960 mod p(x) */
+ .octa 0x00000000fbcf000000000000761c0000
+
+ /* x^168000 mod p(x), x^167936 mod p(x) */
+ .octa 0x0000000026350000000000009db30000
+
+ /* x^166976 mod p(x), x^166912 mod p(x) */
+ .octa 0x00000000b64f0000000000003e9f0000
+
+ /* x^165952 mod p(x), x^165888 mod p(x) */
+ .octa 0x00000000bd0e00000000000078590000
+
+ /* x^164928 mod p(x), x^164864 mod p(x) */
+ .octa 0x00000000d9360000000000008bc80000
+
+ /* x^163904 mod p(x), x^163840 mod p(x) */
+ .octa 0x000000002f140000000000008c9f0000
+
+ /* x^162880 mod p(x), x^162816 mod p(x) */
+ .octa 0x000000006a270000000000006af70000
+
+ /* x^161856 mod p(x), x^161792 mod p(x) */
+ .octa 0x000000006685000000000000e5210000
+
+ /* x^160832 mod p(x), x^160768 mod p(x) */
+ .octa 0x0000000062da00000000000008290000
+
+ /* x^159808 mod p(x), x^159744 mod p(x) */
+ .octa 0x00000000bb4b000000000000e4d00000
+
+ /* x^158784 mod p(x), x^158720 mod p(x) */
+ .octa 0x00000000d2490000000000004ae10000
+
+ /* x^157760 mod p(x), x^157696 mod p(x) */
+ .octa 0x00000000c85b00000000000000e70000
+
+ /* x^156736 mod p(x), x^156672 mod p(x) */
+ .octa 0x00000000c37a00000000000015650000
+
+ /* x^155712 mod p(x), x^155648 mod p(x) */
+ .octa 0x0000000018530000000000001c2f0000
+
+ /* x^154688 mod p(x), x^154624 mod p(x) */
+ .octa 0x00000000b46600000000000037bd0000
+
+ /* x^153664 mod p(x), x^153600 mod p(x) */
+ .octa 0x00000000439b00000000000012190000
+
+ /* x^152640 mod p(x), x^152576 mod p(x) */
+ .octa 0x00000000b1260000000000005ece0000
+
+ /* x^151616 mod p(x), x^151552 mod p(x) */
+ .octa 0x00000000d8110000000000002a5e0000
+
+ /* x^150592 mod p(x), x^150528 mod p(x) */
+ .octa 0x00000000099f00000000000052330000
+
+ /* x^149568 mod p(x), x^149504 mod p(x) */
+ .octa 0x00000000f9f9000000000000f9120000
+
+ /* x^148544 mod p(x), x^148480 mod p(x) */
+ .octa 0x000000005cc00000000000000ddc0000
+
+ /* x^147520 mod p(x), x^147456 mod p(x) */
+ .octa 0x00000000343b00000000000012200000
+
+ /* x^146496 mod p(x), x^146432 mod p(x) */
+ .octa 0x000000009222000000000000d12b0000
+
+ /* x^145472 mod p(x), x^145408 mod p(x) */
+ .octa 0x00000000d781000000000000eb2d0000
+
+ /* x^144448 mod p(x), x^144384 mod p(x) */
+ .octa 0x000000000bf400000000000058970000
+
+ /* x^143424 mod p(x), x^143360 mod p(x) */
+ .octa 0x00000000094200000000000013690000
+
+ /* x^142400 mod p(x), x^142336 mod p(x) */
+ .octa 0x00000000d55100000000000051950000
+
+ /* x^141376 mod p(x), x^141312 mod p(x) */
+ .octa 0x000000008f11000000000000954b0000
+
+ /* x^140352 mod p(x), x^140288 mod p(x) */
+ .octa 0x00000000140f000000000000b29e0000
+
+ /* x^139328 mod p(x), x^139264 mod p(x) */
+ .octa 0x00000000c6db000000000000db5d0000
+
+ /* x^138304 mod p(x), x^138240 mod p(x) */
+ .octa 0x00000000715b000000000000dfaf0000
+
+ /* x^137280 mod p(x), x^137216 mod p(x) */
+ .octa 0x000000000dea000000000000e3b60000
+
+ /* x^136256 mod p(x), x^136192 mod p(x) */
+ .octa 0x000000006f94000000000000ddaf0000
+
+ /* x^135232 mod p(x), x^135168 mod p(x) */
+ .octa 0x0000000024e1000000000000e4f70000
+
+ /* x^134208 mod p(x), x^134144 mod p(x) */
+ .octa 0x000000008810000000000000aa110000
+
+ /* x^133184 mod p(x), x^133120 mod p(x) */
+ .octa 0x0000000030c2000000000000a8e60000
+
+ /* x^132160 mod p(x), x^132096 mod p(x) */
+ .octa 0x00000000e6d0000000000000ccf30000
+
+ /* x^131136 mod p(x), x^131072 mod p(x) */
+ .octa 0x000000004da000000000000079bf0000
+
+ /* x^130112 mod p(x), x^130048 mod p(x) */
+ .octa 0x000000007759000000000000b3a30000
+
+ /* x^129088 mod p(x), x^129024 mod p(x) */
+ .octa 0x00000000597400000000000028790000
+
+ /* x^128064 mod p(x), x^128000 mod p(x) */
+ .octa 0x000000007acd000000000000b5820000
+
+ /* x^127040 mod p(x), x^126976 mod p(x) */
+ .octa 0x00000000e6e400000000000026ad0000
+
+ /* x^126016 mod p(x), x^125952 mod p(x) */
+ .octa 0x000000006d49000000000000985b0000
+
+ /* x^124992 mod p(x), x^124928 mod p(x) */
+ .octa 0x000000000f0800000000000011520000
+
+ /* x^123968 mod p(x), x^123904 mod p(x) */
+ .octa 0x000000002c7f000000000000846c0000
+
+ /* x^122944 mod p(x), x^122880 mod p(x) */
+ .octa 0x000000005ce7000000000000ae1d0000
+
+ /* x^121920 mod p(x), x^121856 mod p(x) */
+ .octa 0x00000000d4cb000000000000e21d0000
+
+ /* x^120896 mod p(x), x^120832 mod p(x) */
+ .octa 0x000000003a2300000000000019bb0000
+
+ /* x^119872 mod p(x), x^119808 mod p(x) */
+ .octa 0x000000000e1700000000000095290000
+
+ /* x^118848 mod p(x), x^118784 mod p(x) */
+ .octa 0x000000006e6400000000000050d20000
+
+ /* x^117824 mod p(x), x^117760 mod p(x) */
+ .octa 0x000000008d5c0000000000000cd10000
+
+ /* x^116800 mod p(x), x^116736 mod p(x) */
+ .octa 0x00000000ef310000000000007b570000
+
+ /* x^115776 mod p(x), x^115712 mod p(x) */
+ .octa 0x00000000645d00000000000053d60000
+
+ /* x^114752 mod p(x), x^114688 mod p(x) */
+ .octa 0x0000000018fc00000000000077510000
+
+ /* x^113728 mod p(x), x^113664 mod p(x) */
+ .octa 0x000000000cb3000000000000a7b70000
+
+ /* x^112704 mod p(x), x^112640 mod p(x) */
+ .octa 0x00000000991b000000000000d0780000
+
+ /* x^111680 mod p(x), x^111616 mod p(x) */
+ .octa 0x00000000845a000000000000be3c0000
+
+ /* x^110656 mod p(x), x^110592 mod p(x) */
+ .octa 0x00000000d3a9000000000000df020000
+
+ /* x^109632 mod p(x), x^109568 mod p(x) */
+ .octa 0x0000000017d7000000000000063e0000
+
+ /* x^108608 mod p(x), x^108544 mod p(x) */
+ .octa 0x000000007a860000000000008ab40000
+
+ /* x^107584 mod p(x), x^107520 mod p(x) */
+ .octa 0x00000000fd7c000000000000c7bd0000
+
+ /* x^106560 mod p(x), x^106496 mod p(x) */
+ .octa 0x00000000a56b000000000000efd60000
+
+ /* x^105536 mod p(x), x^105472 mod p(x) */
+ .octa 0x0000000010e400000000000071380000
+
+ /* x^104512 mod p(x), x^104448 mod p(x) */
+ .octa 0x00000000994500000000000004d30000
+
+ /* x^103488 mod p(x), x^103424 mod p(x) */
+ .octa 0x00000000b83c0000000000003b0e0000
+
+ /* x^102464 mod p(x), x^102400 mod p(x) */
+ .octa 0x00000000d6c10000000000008b020000
+
+ /* x^101440 mod p(x), x^101376 mod p(x) */
+ .octa 0x000000009efc000000000000da940000
+
+ /* x^100416 mod p(x), x^100352 mod p(x) */
+ .octa 0x000000005e87000000000000f9f70000
+
+ /* x^99392 mod p(x), x^99328 mod p(x) */
+ .octa 0x000000006c9b00000000000045e40000
+
+ /* x^98368 mod p(x), x^98304 mod p(x) */
+ .octa 0x00000000178a00000000000083940000
+
+ /* x^97344 mod p(x), x^97280 mod p(x) */
+ .octa 0x00000000f0c8000000000000f0a00000
+
+ /* x^96320 mod p(x), x^96256 mod p(x) */
+ .octa 0x00000000f699000000000000b74b0000
+
+ /* x^95296 mod p(x), x^95232 mod p(x) */
+ .octa 0x00000000316d000000000000c1cf0000
+
+ /* x^94272 mod p(x), x^94208 mod p(x) */
+ .octa 0x00000000987e00000000000072680000
+
+ /* x^93248 mod p(x), x^93184 mod p(x) */
+ .octa 0x00000000acff000000000000e0ab0000
+
+ /* x^92224 mod p(x), x^92160 mod p(x) */
+ .octa 0x00000000a1f6000000000000c5a80000
+
+ /* x^91200 mod p(x), x^91136 mod p(x) */
+ .octa 0x0000000061bd000000000000cf690000
+
+ /* x^90176 mod p(x), x^90112 mod p(x) */
+ .octa 0x00000000c9f2000000000000cbcc0000
+
+ /* x^89152 mod p(x), x^89088 mod p(x) */
+ .octa 0x000000005a33000000000000de050000
+
+ /* x^88128 mod p(x), x^88064 mod p(x) */
+ .octa 0x00000000e416000000000000ccd70000
+
+ /* x^87104 mod p(x), x^87040 mod p(x) */
+ .octa 0x0000000058930000000000002f670000
+
+ /* x^86080 mod p(x), x^86016 mod p(x) */
+ .octa 0x00000000a9d3000000000000152f0000
+
+ /* x^85056 mod p(x), x^84992 mod p(x) */
+ .octa 0x00000000c114000000000000ecc20000
+
+ /* x^84032 mod p(x), x^83968 mod p(x) */
+ .octa 0x00000000b9270000000000007c890000
+
+ /* x^83008 mod p(x), x^82944 mod p(x) */
+ .octa 0x000000002e6000000000000006ee0000
+
+ /* x^81984 mod p(x), x^81920 mod p(x) */
+ .octa 0x00000000dfc600000000000009100000
+
+ /* x^80960 mod p(x), x^80896 mod p(x) */
+ .octa 0x000000004911000000000000ad4e0000
+
+ /* x^79936 mod p(x), x^79872 mod p(x) */
+ .octa 0x00000000ae1b000000000000b04d0000
+
+ /* x^78912 mod p(x), x^78848 mod p(x) */
+ .octa 0x0000000005fa000000000000e9900000
+
+ /* x^77888 mod p(x), x^77824 mod p(x) */
+ .octa 0x0000000004a1000000000000cc6f0000
+
+ /* x^76864 mod p(x), x^76800 mod p(x) */
+ .octa 0x00000000af73000000000000ed110000
+
+ /* x^75840 mod p(x), x^75776 mod p(x) */
+ .octa 0x0000000082530000000000008f7e0000
+
+ /* x^74816 mod p(x), x^74752 mod p(x) */
+ .octa 0x00000000cfdc000000000000594f0000
+
+ /* x^73792 mod p(x), x^73728 mod p(x) */
+ .octa 0x00000000a6b6000000000000a8750000
+
+ /* x^72768 mod p(x), x^72704 mod p(x) */
+ .octa 0x00000000fd76000000000000aa0c0000
+
+ /* x^71744 mod p(x), x^71680 mod p(x) */
+ .octa 0x0000000006f500000000000071db0000
+
+ /* x^70720 mod p(x), x^70656 mod p(x) */
+ .octa 0x0000000037ca000000000000ab0c0000
+
+ /* x^69696 mod p(x), x^69632 mod p(x) */
+ .octa 0x00000000d7ab000000000000b7a00000
+
+ /* x^68672 mod p(x), x^68608 mod p(x) */
+ .octa 0x00000000440800000000000090d30000
+
+ /* x^67648 mod p(x), x^67584 mod p(x) */
+ .octa 0x00000000186100000000000054730000
+
+ /* x^66624 mod p(x), x^66560 mod p(x) */
+ .octa 0x000000007368000000000000a3a20000
+
+ /* x^65600 mod p(x), x^65536 mod p(x) */
+ .octa 0x0000000026d0000000000000f9040000
+
+ /* x^64576 mod p(x), x^64512 mod p(x) */
+ .octa 0x00000000fe770000000000009c0a0000
+
+ /* x^63552 mod p(x), x^63488 mod p(x) */
+ .octa 0x000000002cba000000000000d1e70000
+
+ /* x^62528 mod p(x), x^62464 mod p(x) */
+ .octa 0x00000000f8bd0000000000005ac10000
+
+ /* x^61504 mod p(x), x^61440 mod p(x) */
+ .octa 0x000000007372000000000000d68d0000
+
+ /* x^60480 mod p(x), x^60416 mod p(x) */
+ .octa 0x00000000f37f00000000000089f60000
+
+ /* x^59456 mod p(x), x^59392 mod p(x) */
+ .octa 0x00000000078400000000000008a90000
+
+ /* x^58432 mod p(x), x^58368 mod p(x) */
+ .octa 0x00000000d3e400000000000042360000
+
+ /* x^57408 mod p(x), x^57344 mod p(x) */
+ .octa 0x00000000eba800000000000092d50000
+
+ /* x^56384 mod p(x), x^56320 mod p(x) */
+ .octa 0x00000000afbe000000000000b4d50000
+
+ /* x^55360 mod p(x), x^55296 mod p(x) */
+ .octa 0x00000000d8ca000000000000c9060000
+
+ /* x^54336 mod p(x), x^54272 mod p(x) */
+ .octa 0x00000000c2d00000000000008f4f0000
+
+ /* x^53312 mod p(x), x^53248 mod p(x) */
+ .octa 0x00000000373200000000000028690000
+
+ /* x^52288 mod p(x), x^52224 mod p(x) */
+ .octa 0x0000000046ae000000000000c3b30000
+
+ /* x^51264 mod p(x), x^51200 mod p(x) */
+ .octa 0x00000000b243000000000000f8700000
+
+ /* x^50240 mod p(x), x^50176 mod p(x) */
+ .octa 0x00000000f7f500000000000029eb0000
+
+ /* x^49216 mod p(x), x^49152 mod p(x) */
+ .octa 0x000000000c7e000000000000fe730000
+
+ /* x^48192 mod p(x), x^48128 mod p(x) */
+ .octa 0x00000000c38200000000000096000000
+
+ /* x^47168 mod p(x), x^47104 mod p(x) */
+ .octa 0x000000008956000000000000683c0000
+
+ /* x^46144 mod p(x), x^46080 mod p(x) */
+ .octa 0x00000000422d0000000000005f1e0000
+
+ /* x^45120 mod p(x), x^45056 mod p(x) */
+ .octa 0x00000000ac0f0000000000006f810000
+
+ /* x^44096 mod p(x), x^44032 mod p(x) */
+ .octa 0x00000000ce30000000000000031f0000
+
+ /* x^43072 mod p(x), x^43008 mod p(x) */
+ .octa 0x000000003d43000000000000455a0000
+
+ /* x^42048 mod p(x), x^41984 mod p(x) */
+ .octa 0x000000007ebe000000000000a6050000
+
+ /* x^41024 mod p(x), x^40960 mod p(x) */
+ .octa 0x00000000976e00000000000077eb0000
+
+ /* x^40000 mod p(x), x^39936 mod p(x) */
+ .octa 0x000000000872000000000000389c0000
+
+ /* x^38976 mod p(x), x^38912 mod p(x) */
+ .octa 0x000000008979000000000000c7b20000
+
+ /* x^37952 mod p(x), x^37888 mod p(x) */
+ .octa 0x000000005c1e0000000000001d870000
+
+ /* x^36928 mod p(x), x^36864 mod p(x) */
+ .octa 0x00000000aebb00000000000045810000
+
+ /* x^35904 mod p(x), x^35840 mod p(x) */
+ .octa 0x000000004f7e0000000000006d4a0000
+
+ /* x^34880 mod p(x), x^34816 mod p(x) */
+ .octa 0x00000000ea98000000000000b9200000
+
+ /* x^33856 mod p(x), x^33792 mod p(x) */
+ .octa 0x00000000f39600000000000022f20000
+
+ /* x^32832 mod p(x), x^32768 mod p(x) */
+ .octa 0x000000000bc500000000000041ca0000
+
+ /* x^31808 mod p(x), x^31744 mod p(x) */
+ .octa 0x00000000786400000000000078500000
+
+ /* x^30784 mod p(x), x^30720 mod p(x) */
+ .octa 0x00000000be970000000000009e7e0000
+
+ /* x^29760 mod p(x), x^29696 mod p(x) */
+ .octa 0x00000000dd6d000000000000a53c0000
+
+ /* x^28736 mod p(x), x^28672 mod p(x) */
+ .octa 0x000000004c3f00000000000039340000
+
+ /* x^27712 mod p(x), x^27648 mod p(x) */
+ .octa 0x0000000093a4000000000000b58e0000
+
+ /* x^26688 mod p(x), x^26624 mod p(x) */
+ .octa 0x0000000050fb00000000000062d40000
+
+ /* x^25664 mod p(x), x^25600 mod p(x) */
+ .octa 0x00000000f505000000000000a26f0000
+
+ /* x^24640 mod p(x), x^24576 mod p(x) */
+ .octa 0x0000000064f900000000000065e60000
+
+ /* x^23616 mod p(x), x^23552 mod p(x) */
+ .octa 0x00000000e8c2000000000000aad90000
+
+ /* x^22592 mod p(x), x^22528 mod p(x) */
+ .octa 0x00000000720b000000000000a3b00000
+
+ /* x^21568 mod p(x), x^21504 mod p(x) */
+ .octa 0x00000000e992000000000000d2680000
+
+ /* x^20544 mod p(x), x^20480 mod p(x) */
+ .octa 0x000000009132000000000000cf4c0000
+
+ /* x^19520 mod p(x), x^19456 mod p(x) */
+ .octa 0x00000000608a00000000000076610000
+
+ /* x^18496 mod p(x), x^18432 mod p(x) */
+ .octa 0x000000009948000000000000fb9f0000
+
+ /* x^17472 mod p(x), x^17408 mod p(x) */
+ .octa 0x00000000173000000000000003770000
+
+ /* x^16448 mod p(x), x^16384 mod p(x) */
+ .octa 0x000000006fe300000000000004880000
+
+ /* x^15424 mod p(x), x^15360 mod p(x) */
+ .octa 0x00000000e15300000000000056a70000
+
+ /* x^14400 mod p(x), x^14336 mod p(x) */
+ .octa 0x0000000092d60000000000009dfd0000
+
+ /* x^13376 mod p(x), x^13312 mod p(x) */
+ .octa 0x0000000002fd00000000000074c80000
+
+ /* x^12352 mod p(x), x^12288 mod p(x) */
+ .octa 0x00000000c78b000000000000a3ec0000
+
+ /* x^11328 mod p(x), x^11264 mod p(x) */
+ .octa 0x000000009262000000000000b3530000
+
+ /* x^10304 mod p(x), x^10240 mod p(x) */
+ .octa 0x0000000084f200000000000047bf0000
+
+ /* x^9280 mod p(x), x^9216 mod p(x) */
+ .octa 0x0000000067ee000000000000e97c0000
+
+ /* x^8256 mod p(x), x^8192 mod p(x) */
+ .octa 0x00000000535b00000000000091e10000
+
+ /* x^7232 mod p(x), x^7168 mod p(x) */
+ .octa 0x000000007ebb00000000000055060000
+
+ /* x^6208 mod p(x), x^6144 mod p(x) */
+ .octa 0x00000000c6a1000000000000fd360000
+
+ /* x^5184 mod p(x), x^5120 mod p(x) */
+ .octa 0x000000001be500000000000055860000
+
+ /* x^4160 mod p(x), x^4096 mod p(x) */
+ .octa 0x00000000ae0e0000000000005bd00000
+
+ /* x^3136 mod p(x), x^3072 mod p(x) */
+ .octa 0x0000000022040000000000008db20000
+
+ /* x^2112 mod p(x), x^2048 mod p(x) */
+ .octa 0x00000000c9eb000000000000efe20000
+
+ /* x^1088 mod p(x), x^1024 mod p(x) */
+ .octa 0x0000000039b400000000000051d10000
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
+ .octa 0xefe20000dccf00009440000033590000
+
+ /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
+ .octa 0xee6300002f3f000062180000e0ed0000
+
+ /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
+ .octa 0xcf5f000017ef0000ccbe000023d30000
+
+ /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
+ .octa 0x6d0c0000a30e00000920000042630000
+
+ /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
+ .octa 0x21d30000932b0000a7a00000efcc0000
+
+ /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
+ .octa 0x10be00000b310000666f00000d1c0000
+
+ /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
+ .octa 0x1f240000ce9e0000caad0000589e0000
+
+ /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
+ .octa 0x29610000d02b000039b400007cf50000
+
+ /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
+ .octa 0x51d100009d9d00003c0e0000bfd60000
+
+ /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
+ .octa 0xda390000ceae000013830000713c0000
+
+ /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
+ .octa 0xb67800001e16000085c0000080a60000
+
+ /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
+ .octa 0x0db40000f7f90000371d0000e6580000
+
+ /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
+ .octa 0x87e70000044c0000aadb0000a4970000
+
+ /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
+ .octa 0x1f990000ad180000d8b30000e7b50000
+
+ /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
+ .octa 0xbe6c00006ee300004c1a000006df0000
+
+ /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
+ .octa 0xfb0b00002d560000136800008bb70000
+
+
+.barrett_constants:
+ /* Barrett constant m - (4^32)/n */
+ .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */
+ /* Barrett constant n */
+ .octa 0x0000000000000000000000018bb70000
+
+#define CRC_FUNCTION_NAME __crct10dif_vpmsum
+#include "crc-vpmsum-template.S"
diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig
new file mode 100644
index 000000000000..3f9e1bbd9905
--- /dev/null
+++ b/arch/powerpc/lib/crypto/Kconfig
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA20_P10
+ tristate
+ depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ default CRYPTO_LIB_CHACHA
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_P10
+ tristate
+ depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ depends on BROKEN # Needs to be fixed to work in softirq context
+ default CRYPTO_LIB_POLY1305
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC
+
+config CRYPTO_SHA256_PPC_SPE
+ tristate
+ depends on SPE
+ default CRYPTO_LIB_SHA256
+ select CRYPTO_ARCH_HAVE_LIB_SHA256
diff --git a/arch/powerpc/lib/crypto/Makefile b/arch/powerpc/lib/crypto/Makefile
new file mode 100644
index 000000000000..27f231f8e334
--- /dev/null
+++ b/arch/powerpc/lib/crypto/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
+chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
+poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
+
+obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
+sha256-ppc-spe-y := sha256.o sha256-spe-asm.o
diff --git a/arch/powerpc/lib/crypto/chacha-p10-glue.c b/arch/powerpc/lib/crypto/chacha-p10-glue.c
new file mode 100644
index 000000000000..fcd23c6f1590
--- /dev/null
+++ b/arch/powerpc/lib/crypto/chacha-p10-glue.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * ChaCha stream cipher (P10 accelerated)
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+
+asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ unsigned int l = bytes & ~0x0FF;
+
+ if (l > 0) {
+ chacha_p10le_8x(state, dst, src, l, nrounds);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state->x[12] += l / CHACHA_BLOCK_SIZE;
+ }
+
+ if (bytes > 0)
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ hchacha_block_generic(state, out, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ vsx_begin();
+ chacha_p10_do_8x(state, dst, src, todo, nrounds);
+ vsx_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+ return static_key_enabled(&have_p10);
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_p10_init(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ static_branch_enable(&have_p10);
+ return 0;
+}
+subsys_initcall(chacha_p10_init);
+
+static void __exit chacha_p10_exit(void)
+{
+}
+module_exit(chacha_p10_exit);
+
+MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)");
+MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/powerpc/lib/crypto/chacha-p10le-8x.S b/arch/powerpc/lib/crypto/chacha-p10le-8x.S
new file mode 100644
index 000000000000..b29562bd5d40
--- /dev/null
+++ b/arch/powerpc/lib/crypto/chacha-p10le-8x.S
@@ -0,0 +1,840 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# do rounds, 8 quarter rounds
+# 1. a += b; d ^= a; d <<<= 16;
+# 2. c += d; b ^= c; b <<<= 12;
+# 3. a += b; d ^= a; d <<<= 8;
+# 4. c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+#
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+.macro QT_loop_8x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 4, 4, 25 #
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 20, 20, 25 #
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ xxlor 32+25, 0, 0
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+ vrlw 4, 4, 28 #
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 20, 20, 28 #
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ xxlor 32+28, 0, 0
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+
+ xxlor 32+25, 0, 0
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 4, 4, 25
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ vrlw 20, 20, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 4, 4, 28
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ vrlw 20, 20, 28
+ xxlor 32+28, 0, 0
+.endm
+
+.macro QT_loop_4x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 20
+ vpermxor 13, 13, 1, 20
+ vpermxor 14, 14, 2, 20
+ vpermxor 15, 15, 3, 20
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 21
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 22
+ vpermxor 13, 13, 1, 22
+ vpermxor 14, 14, 2, 22
+ vpermxor 15, 15, 3, 22
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 23
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 20
+ vpermxor 12, 12, 1, 20
+ vpermxor 13, 13, 2, 20
+ vpermxor 14, 14, 3, 20
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vrlw 4, 4, 21
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 22
+ vpermxor 12, 12, 1, 22
+ vpermxor 13, 13, 2, 22
+ vpermxor 14, 14, 3, 22
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+ vrlw 4, 4, 23
+.endm
+
+# Transpose
+.macro TP_4x a0 a1 a2 a3
+ xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
+ xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
+ xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
+ xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
+ xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
+ xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
+ xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
+ xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+ vadduwm \S+0, \S+0, 16-\S
+ vadduwm \S+4, \S+4, 17-\S
+ vadduwm \S+8, \S+8, 18-\S
+ vadduwm \S+12, \S+12, 19-\S
+
+ vadduwm \S+1, \S+1, 16-\S
+ vadduwm \S+5, \S+5, 17-\S
+ vadduwm \S+9, \S+9, 18-\S
+ vadduwm \S+13, \S+13, 19-\S
+
+ vadduwm \S+2, \S+2, 16-\S
+ vadduwm \S+6, \S+6, 17-\S
+ vadduwm \S+10, \S+10, 18-\S
+ vadduwm \S+14, \S+14, 19-\S
+
+ vadduwm \S+3, \S+3, 16-\S
+ vadduwm \S+7, \S+7, 17-\S
+ vadduwm \S+11, \S+11, 18-\S
+ vadduwm \S+15, \S+15, 19-\S
+.endm
+
+#
+# write 256 bytes
+#
+.macro Write_256 S
+ add 9, 14, 5
+ add 16, 14, 4
+ lxvw4x 0, 0, 9
+ lxvw4x 1, 17, 9
+ lxvw4x 2, 18, 9
+ lxvw4x 3, 19, 9
+ lxvw4x 4, 20, 9
+ lxvw4x 5, 21, 9
+ lxvw4x 6, 22, 9
+ lxvw4x 7, 23, 9
+ lxvw4x 8, 24, 9
+ lxvw4x 9, 25, 9
+ lxvw4x 10, 26, 9
+ lxvw4x 11, 27, 9
+ lxvw4x 12, 28, 9
+ lxvw4x 13, 29, 9
+ lxvw4x 14, 30, 9
+ lxvw4x 15, 31, 9
+
+ xxlxor \S+32, \S+32, 0
+ xxlxor \S+36, \S+36, 1
+ xxlxor \S+40, \S+40, 2
+ xxlxor \S+44, \S+44, 3
+ xxlxor \S+33, \S+33, 4
+ xxlxor \S+37, \S+37, 5
+ xxlxor \S+41, \S+41, 6
+ xxlxor \S+45, \S+45, 7
+ xxlxor \S+34, \S+34, 8
+ xxlxor \S+38, \S+38, 9
+ xxlxor \S+42, \S+42, 10
+ xxlxor \S+46, \S+46, 11
+ xxlxor \S+35, \S+35, 12
+ xxlxor \S+39, \S+39, 13
+ xxlxor \S+43, \S+43, 14
+ xxlxor \S+47, \S+47, 15
+
+ stxvw4x \S+32, 0, 16
+ stxvw4x \S+36, 17, 16
+ stxvw4x \S+40, 18, 16
+ stxvw4x \S+44, 19, 16
+
+ stxvw4x \S+33, 20, 16
+ stxvw4x \S+37, 21, 16
+ stxvw4x \S+41, 22, 16
+ stxvw4x \S+45, 23, 16
+
+ stxvw4x \S+34, 24, 16
+ stxvw4x \S+38, 25, 16
+ stxvw4x \S+42, 26, 16
+ stxvw4x \S+46, 27, 16
+
+ stxvw4x \S+35, 28, 16
+ stxvw4x \S+39, 29, 16
+ stxvw4x \S+43, 30, 16
+ stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
+# unsigned int len, int nrounds);
+#
+SYM_FUNC_START(chacha_p10le_8x)
+.align 5
+ cmpdi 6, 0
+ ble Out_no_chacha
+
+ SAVE_REGS
+
+ # r17 - r31 mainly for Write_256 macro.
+ li 17, 16
+ li 18, 32
+ li 19, 48
+ li 20, 64
+ li 21, 80
+ li 22, 96
+ li 23, 112
+ li 24, 128
+ li 25, 144
+ li 26, 160
+ li 27, 176
+ li 28, 192
+ li 29, 208
+ li 30, 224
+ li 31, 240
+
+ mr 15, 6 # len
+ li 14, 0 # offset to inp and outp
+
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ # create (0, 1, 2, 3) counters
+ vspltisw 0, 0
+ vspltisw 1, 1
+ vspltisw 2, 2
+ vspltisw 3, 3
+ vmrghw 4, 0, 1
+ vmrglw 5, 2, 3
+ vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+
+ mtctr 8
+
+ # save constants to vsx
+ xxlor 16, 48, 48
+ xxlor 17, 49, 49
+ xxlor 18, 50, 50
+ xxlor 19, 51, 51
+
+ vspltisw 25, 4
+ vspltisw 26, 8
+
+ xxlor 25, 32+26, 32+26
+ xxlor 24, 32+25, 32+25
+
+ vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ xxlor 20, 32+20, 32+20
+ xxlor 21, 32+21, 32+21
+ xxlor 22, 32+22, 32+22
+ xxlor 23, 32+23, 32+23
+
+ cmpdi 6, 512
+ blt Loop_last
+
+Loop_8x:
+ xxspltw 32+0, 16, 0
+ xxspltw 32+1, 16, 1
+ xxspltw 32+2, 16, 2
+ xxspltw 32+3, 16, 3
+
+ xxspltw 32+4, 17, 0
+ xxspltw 32+5, 17, 1
+ xxspltw 32+6, 17, 2
+ xxspltw 32+7, 17, 3
+ xxspltw 32+8, 18, 0
+ xxspltw 32+9, 18, 1
+ xxspltw 32+10, 18, 2
+ xxspltw 32+11, 18, 3
+ xxspltw 32+12, 19, 0
+ xxspltw 32+13, 19, 1
+ xxspltw 32+14, 19, 2
+ xxspltw 32+15, 19, 3
+ vadduwm 12, 12, 30 # increase counter
+
+ xxspltw 32+16, 16, 0
+ xxspltw 32+17, 16, 1
+ xxspltw 32+18, 16, 2
+ xxspltw 32+19, 16, 3
+
+ xxspltw 32+20, 17, 0
+ xxspltw 32+21, 17, 1
+ xxspltw 32+22, 17, 2
+ xxspltw 32+23, 17, 3
+ xxspltw 32+24, 18, 0
+ xxspltw 32+25, 18, 1
+ xxspltw 32+26, 18, 2
+ xxspltw 32+27, 18, 3
+ xxspltw 32+28, 19, 0
+ xxspltw 32+29, 19, 1
+ vadduwm 28, 28, 31 # increase counter
+ xxspltw 32+30, 19, 2
+ xxspltw 32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+ QT_loop_8x
+
+ bdnz quarter_loop_8x
+
+ xxlor 0, 32+30, 32+30
+ xxlor 32+30, 30, 30
+ vadduwm 12, 12, 30
+ xxlor 32+30, 0, 0
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ xxlor 0, 48, 48
+ xxlor 1, 49, 49
+ xxlor 2, 50, 50
+ xxlor 3, 51, 51
+ xxlor 48, 16, 16
+ xxlor 49, 17, 17
+ xxlor 50, 18, 18
+ xxlor 51, 19, 19
+ Add_state 0
+ xxlor 48, 0, 0
+ xxlor 49, 1, 1
+ xxlor 50, 2, 2
+ xxlor 51, 3, 3
+ Write_256 0
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len -=256
+
+ xxlor 5, 32+31, 32+31
+ xxlor 32+31, 31, 31
+ vadduwm 28, 28, 31
+ xxlor 32+31, 5, 5
+ TP_4x 16+0, 16+1, 16+2, 16+3
+ TP_4x 16+4, 16+5, 16+6, 16+7
+ TP_4x 16+8, 16+9, 16+10, 16+11
+ TP_4x 16+12, 16+13, 16+14, 16+15
+
+ xxlor 32, 16, 16
+ xxlor 33, 17, 17
+ xxlor 34, 18, 18
+ xxlor 35, 19, 19
+ Add_state 16
+ Write_256 16
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len +=256
+
+ xxlor 32+24, 24, 24
+ xxlor 32+25, 25, 25
+ xxlor 32+30, 30, 30
+ vadduwm 30, 30, 25
+ vadduwm 31, 30, 24
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ cmpdi 15, 0
+ beq Out_loop
+
+ cmpdi 15, 512
+ blt Loop_last
+
+ mtctr 8
+ b Loop_8x
+
+Loop_last:
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+ mtctr 8
+
+Loop_4x:
+ vspltw 0, 16, 0
+ vspltw 1, 16, 1
+ vspltw 2, 16, 2
+ vspltw 3, 16, 3
+
+ vspltw 4, 17, 0
+ vspltw 5, 17, 1
+ vspltw 6, 17, 2
+ vspltw 7, 17, 3
+ vspltw 8, 18, 0
+ vspltw 9, 18, 1
+ vspltw 10, 18, 2
+ vspltw 11, 18, 3
+ vspltw 12, 19, 0
+ vadduwm 12, 12, 30 # increase counter
+ vspltw 13, 19, 1
+ vspltw 14, 19, 2
+ vspltw 15, 19, 3
+
+.align 5
+quarter_loop:
+ QT_loop_4x
+
+ bdnz quarter_loop
+
+ vadduwm 12, 12, 30
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ Add_state 0
+ Write_256 0
+ addi 14, 14, 256 # offset += 256
+ addi 15, 15, -256 # len += 256
+
+ # Update state counter
+ vspltisw 25, 4
+ vadduwm 30, 30, 25
+
+ cmpdi 15, 0
+ beq Out_loop
+ cmpdi 15, 256
+ blt Out_loop
+
+ mtctr 8
+ b Loop_4x
+
+Out_loop:
+ RESTORE_REGS
+ blr
+
+Out_no_chacha:
+ li 3, 0
+ blr
+SYM_FUNC_END(chacha_p10le_8x)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 5
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
+SYM_DATA_END(PERMX)
diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
new file mode 100644
index 000000000000..3f1664a724b6
--- /dev/null
+++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+#include <asm/switch_to.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+void poly1305_block_init_arch(struct poly1305_block_state *dctx,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_block_init_generic(dctx, raw_key);
+
+ dctx->h = (struct poly1305_state){};
+ dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
+ dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
+}
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_blocks_generic(state, src, len, padbit);
+ vsx_begin();
+ if (len >= POLY1305_BLOCK_SIZE * 4) {
+ poly1305_p10le_4blocks(state, src, len);
+ src += len - (len % (POLY1305_BLOCK_SIZE * 4));
+ len %= POLY1305_BLOCK_SIZE * 4;
+ }
+ while (len >= POLY1305_BLOCK_SIZE) {
+ poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit);
+ len -= POLY1305_BLOCK_SIZE;
+ src += POLY1305_BLOCK_SIZE;
+ }
+ vsx_end();
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
+void poly1305_emit_arch(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4])
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_emit_generic(state, digest, nonce);
+ poly1305_emit_64(state, nonce, digest);
+}
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+ return static_key_enabled(&have_p10);
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init poly1305_p10_init(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ static_branch_enable(&have_p10);
+ return 0;
+}
+subsys_initcall(poly1305_p10_init);
+
+static void __exit poly1305_p10_exit(void)
+{
+}
+module_exit(poly1305_p10_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
+MODULE_DESCRIPTION("Optimized Poly1305 for P10");
diff --git a/arch/powerpc/lib/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
new file mode 100644
index 000000000000..a3c1987f1ecd
--- /dev/null
+++ b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated poly1305 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# Poly1305 - this version mainly using vector/VSX/Scalar
+# - 26 bits limbs
+# - Handle multiple 64 byte blcok.
+#
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+#
+# Improve performance by breaking down polynominal to the sum of products with
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0
+# to 9 vectors for multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+# vs [r^1, r^3, r^2, r^4]
+# vs0 = [r0,.....]
+# vs1 = [r1,.....]
+# vs2 = [r2,.....]
+# vs3 = [r3,.....]
+# vs4 = [r4,.....]
+# vs5 = [r1*5,...]
+# vs6 = [r2*5,...]
+# vs7 = [r2*5,...]
+# vs8 = [r4*5,...]
+#
+# Each word in a vector consists a member of a "r/s" in [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+# k = 32 bytes key
+# r3 = k (r, s)
+# r4 = mlen
+# r5 = m
+#
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+#
+# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
+# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
+# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
+# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
+# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
+#
+# [r^2, r^3, r^1, r^4]
+# [m3, m2, m4, m1]
+#
+# multiply odd and even words
+.macro mul_odd
+ vmulouw 14, 4, 26
+ vmulouw 10, 5, 3
+ vmulouw 11, 6, 2
+ vmulouw 12, 7, 1
+ vmulouw 13, 8, 0
+ vmulouw 15, 4, 27
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vmulouw 10, 5, 26
+ vmulouw 11, 6, 3
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vmulouw 12, 7, 2
+ vmulouw 13, 8, 1
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+ vmulouw 16, 4, 28
+ vmulouw 10, 5, 27
+ vmulouw 11, 6, 26
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vmulouw 12, 7, 3
+ vmulouw 13, 8, 2
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+ vmulouw 17, 4, 29
+ vmulouw 10, 5, 28
+ vmulouw 11, 6, 27
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vmulouw 12, 7, 26
+ vmulouw 13, 8, 3
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+ vmulouw 18, 4, 30
+ vmulouw 10, 5, 29
+ vmulouw 11, 6, 28
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vmulouw 12, 7, 27
+ vmulouw 13, 8, 26
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+.macro mul_even
+ vmuleuw 9, 4, 26
+ vmuleuw 10, 5, 3
+ vmuleuw 11, 6, 2
+ vmuleuw 12, 7, 1
+ vmuleuw 13, 8, 0
+ vaddudm 14, 14, 9
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+
+ vmuleuw 9, 4, 27
+ vmuleuw 10, 5, 26
+ vmuleuw 11, 6, 3
+ vmuleuw 12, 7, 2
+ vmuleuw 13, 8, 1
+ vaddudm 15, 15, 9
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+
+ vmuleuw 9, 4, 28
+ vmuleuw 10, 5, 27
+ vmuleuw 11, 6, 26
+ vmuleuw 12, 7, 3
+ vmuleuw 13, 8, 2
+ vaddudm 16, 16, 9
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+
+ vmuleuw 9, 4, 29
+ vmuleuw 10, 5, 28
+ vmuleuw 11, 6, 27
+ vmuleuw 12, 7, 26
+ vmuleuw 13, 8, 3
+ vaddudm 17, 17, 9
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+
+ vmuleuw 9, 4, 30
+ vmuleuw 10, 5, 29
+ vmuleuw 11, 6, 28
+ vmuleuw 12, 7, 27
+ vmuleuw 13, 8, 26
+ vaddudm 18, 18, 9
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+#
+# poly1305_setup_r
+#
+# setup r^4, r^3, r^2, r vectors
+# [r, r^3, r^2, r^4]
+# vs0 = [r0,...]
+# vs1 = [r1,...]
+# vs2 = [r2,...]
+# vs3 = [r3,...]
+# vs4 = [r4,...]
+# vs5 = [r4*5,...]
+# vs6 = [r3*5,...]
+# vs7 = [r2*5,...]
+# vs8 = [r1*5,...]
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
+.macro poly1305_setup_r
+
+ # save r
+ xxlor 26, 58, 58
+ xxlor 27, 59, 59
+ xxlor 28, 60, 60
+ xxlor 29, 61, 61
+ xxlor 30, 62, 62
+
+ xxlxor 31, 31, 31
+
+# [r, r^3, r^2, r^4]
+ # compute r^2
+ vmr 4, 26
+ vmr 5, 27
+ vmr 6, 28
+ vmr 7, 29
+ vmr 8, 30
+ bl do_mul # r^2 r^1
+ xxpermdi 58, 58, 36, 0x3 # r0
+ xxpermdi 59, 59, 37, 0x3 # r1
+ xxpermdi 60, 60, 38, 0x3 # r2
+ xxpermdi 61, 61, 39, 0x3 # r3
+ xxpermdi 62, 62, 40, 0x3 # r4
+ xxpermdi 36, 36, 36, 0x3
+ xxpermdi 37, 37, 37, 0x3
+ xxpermdi 38, 38, 38, 0x3
+ xxpermdi 39, 39, 39, 0x3
+ xxpermdi 40, 40, 40, 0x3
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ bl do_mul # r^4 r^3
+ vmrgow 26, 26, 4
+ vmrgow 27, 27, 5
+ vmrgow 28, 28, 6
+ vmrgow 29, 29, 7
+ vmrgow 30, 30, 8
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ # r^2 r^4
+ xxlor 0, 58, 58
+ xxlor 1, 59, 59
+ xxlor 2, 60, 60
+ xxlor 3, 61, 61
+ xxlor 4, 62, 62
+ xxlor 5, 32, 32
+ xxlor 6, 33, 33
+ xxlor 7, 34, 34
+ xxlor 8, 35, 35
+
+ vspltw 9, 26, 3
+ vspltw 10, 26, 2
+ vmrgow 26, 10, 9
+ vspltw 9, 27, 3
+ vspltw 10, 27, 2
+ vmrgow 27, 10, 9
+ vspltw 9, 28, 3
+ vspltw 10, 28, 2
+ vmrgow 28, 10, 9
+ vspltw 9, 29, 3
+ vspltw 10, 29, 2
+ vmrgow 29, 10, 9
+ vspltw 9, 30, 3
+ vspltw 10, 30, 2
+ vmrgow 30, 10, 9
+
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+.endm
+
+SYM_FUNC_START_LOCAL(do_mul)
+ mul_odd
+
+ # do reduction ( h %= p )
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+ blr
+SYM_FUNC_END(do_mul)
+
+#
+# init key
+#
+.macro do_poly1305_init
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ li 14, 16
+ li 15, 32
+ addis 10, 2, cnum@toc@ha
+ addi 10, 10, cnum@toc@l
+ lvx 25, 0, 10 # v25 - mask
+ lvx 31, 14, 10 # v31 = 1a
+ lvx 19, 15, 10 # v19 = 1 << 24
+ lxv 24, 48(10) # vs24
+ lxv 25, 64(10) # vs25
+
+ # initialize
+ # load key from r3 to vectors
+ ld 9, 24(3)
+ ld 10, 32(3)
+ and. 9, 9, 11
+ and. 10, 10, 12
+
+ # break 26 bits
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 58, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 59, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 60, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 61, 0, 17
+ mtvsrdd 62, 0, 18
+
+ # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
+ li 9, 5
+ mtvsrdd 36, 0, 9
+ vmulouw 0, 27, 4 # v0 = rr0
+ vmulouw 1, 28, 4 # v1 = rr1
+ vmulouw 2, 29, 4 # v2 = rr2
+ vmulouw 3, 30, 4 # v3 = rr3
+.endm
+
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+# k = 32 bytes key
+# r3 = k (r, s)
+# r4 = mlen
+# r5 = m
+#
+SYM_FUNC_START(poly1305_p10le_4blocks)
+.align 5
+ cmpdi 5, 64
+ blt Out_no_poly1305
+
+ SAVE_REGS
+
+ do_poly1305_init
+
+ li 21, 0 # counter to message
+
+ poly1305_setup_r
+
+ # load previous H state
+ # break/convert r6 to 26 bits
+ ld 9, 0(3)
+ ld 10, 8(3)
+ ld 19, 16(3)
+ sldi 19, 19, 24
+ mtvsrdd 41, 0, 19
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 36, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 37, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 38, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 39, 0, 17
+ mtvsrdd 40, 0, 18
+ vor 8, 8, 9
+
+ # input m1 m2
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 20, 4, 9
+ vaddudm 21, 5, 10
+ vaddudm 22, 6, 11
+ vaddudm 23, 7, 12
+ vaddudm 24, 8, 13
+
+ # m3 m4
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vspltisb 13, 14
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 20
+ vmrgow 5, 10, 21
+ vmrgow 6, 11, 22
+ vmrgow 7, 12, 23
+ vmrgow 8, 13, 24
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ li 9, 64
+ divdu 31, 5, 9
+
+ cmpdi 31, 0
+ ble Skip_block_loop
+
+ mtctr 31
+
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+# Rewrite the polynominal sum of product as follows,
+# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
+# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2
+# .... Repeat
+# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
+# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
+#
+loop_4blocks:
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+
+ # input m1 m2 m3 m4
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 17, 11, 12, 17
+ vperm 18, 11, 12, 18
+
+ vand 20, 14, 25 # a0
+ vand 9, 17, 25 # a0
+ vsrd 21, 14, 31 # >> 26
+ vsrd 22, 21, 31 # 12 bits left
+ vsrd 10, 17, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+
+ vand 21, 21, 25 # a1
+ vand 10, 10, 25 # a1
+
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 23, 16, 13
+ vor 22, 22, 23
+ vand 22, 22, 25 # a2
+ vand 16, 18, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 23, 15, 13 # >> 14
+ vsrd 24, 23, 31 # >> 26, a4
+ vand 23, 23, 25 # a3
+ vsrd 12, 18, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 4, 4, 20
+ vaddudm 5, 5, 21
+ vaddudm 6, 6, 22
+ vaddudm 7, 7, 23
+ vaddudm 8, 8, 24
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 4
+ vmrgow 5, 10, 5
+ vmrgow 6, 11, 6
+ vmrgow 7, 12, 7
+ vmrgow 8, 13, 8
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ bdnz loop_4blocks
+
+Skip_block_loop:
+ xxlor 58, 0, 0
+ xxlor 59, 1, 1
+ xxlor 60, 2, 2
+ xxlor 61, 3, 3
+ xxlor 62, 4, 4
+ xxlor 32, 5, 5
+ xxlor 33, 6, 6
+ xxlor 34, 7, 7
+ xxlor 35, 8, 8
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+
+ # Sum the products.
+ xxpermdi 41, 31, 46, 0
+ xxpermdi 42, 31, 47, 0
+ vaddudm 4, 14, 9
+ xxpermdi 36, 31, 36, 3
+ vaddudm 5, 15, 10
+ xxpermdi 37, 31, 37, 3
+ xxpermdi 43, 31, 48, 0
+ vaddudm 6, 16, 11
+ xxpermdi 38, 31, 38, 3
+ xxpermdi 44, 31, 49, 0
+ vaddudm 7, 17, 12
+ xxpermdi 39, 31, 39, 3
+ xxpermdi 45, 31, 50, 0
+ vaddudm 8, 18, 13
+ xxpermdi 40, 31, 40, 3
+
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 4, 31
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 8, 8, 11
+ vsrd 12, 8, 31
+ vaddudm 5, 5, 10
+
+ vsrd 11, 5, 31
+ vand 8, 8, 25
+ vand 5, 5, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 6, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vsrd 10, 5, 31
+ vand 5, 5, 25
+ vaddudm 6, 6, 10
+ vaddudm 8, 8, 11
+
+ b do_final_update
+
+do_final_update:
+ # combine 26 bit limbs
+ # v4, v5, v6, v7 and v8 are 26 bit vectors
+ vsld 5, 5, 31
+ vor 20, 4, 5
+ vspltisb 11, 12
+ vsrd 12, 6, 11
+ vsld 6, 6, 31
+ vsld 6, 6, 31
+ vor 20, 20, 6
+ vspltisb 11, 14
+ vsld 7, 7, 11
+ vor 21, 7, 12
+ mfvsrld 16, 40 # save last 2 bytes
+ vsld 8, 8, 11
+ vsld 8, 8, 31
+ vor 21, 21, 8
+ mfvsrld 17, 52
+ mfvsrld 19, 53
+ srdi 16, 16, 24
+
+ std 17, 0(3)
+ std 19, 8(3)
+ stw 16, 16(3)
+
+Out_loop:
+ li 3, 0
+
+ RESTORE_REGS
+
+ blr
+
+Out_no_poly1305:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_p10le_4blocks)
+
+#
+# =======================================================================
+# The following functions implement 64 x 64 bits multiplication poly1305.
+#
+SYM_FUNC_START_LOCAL(Poly1305_init_64)
+ # mask 0x0FFFFFFC0FFFFFFC
+ # mask 0x0FFFFFFC0FFFFFFF
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ # initialize
+ # load key from r3
+ ld 9, 24(3)
+ ld 10, 32(3)
+ and. 9, 9, 11 # cramp mask r0
+ and. 10, 10, 12 # cramp mask r1
+
+ srdi 21, 10, 2
+ add 19, 21, 10 # s1: r19 - (r1 >> 2) *5
+
+ # setup r and s
+ li 25, 0
+ mtvsrdd 32+0, 9, 19 # r0, s1
+ mtvsrdd 32+1, 10, 9 # r1, r0
+ mtvsrdd 32+2, 19, 25 # s1
+ mtvsrdd 32+3, 9, 25 # r0
+
+ blr
+SYM_FUNC_END(Poly1305_init_64)
+
+# Poly1305_mult
+# v6 = (h0, h1), v8 = h2
+# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
+#
+# Output: v7, v10, v11
+#
+SYM_FUNC_START_LOCAL(Poly1305_mult)
+ #
+ # d0 = h0 * r0 + h1 * s1
+ vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
+
+ # d1 = h0 * r1 + h1 * r0 + h2 * s1
+ vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
+ vmsumudm 10, 8, 2, 11 # d1 += h2 * s1
+
+ # d2 = r0
+ vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
+ blr
+SYM_FUNC_END(Poly1305_mult)
+
+#
+# carry reduction
+# h %=p
+#
+# Input: v7, v10, v11
+# Output: r27, r28, r29
+#
+SYM_FUNC_START_LOCAL(Carry_reduction)
+ mfvsrld 27, 32+7
+ mfvsrld 28, 32+10
+ mfvsrld 29, 32+11
+ mfvsrd 20, 32+7 # h0.h
+ mfvsrd 21, 32+10 # h1.h
+
+ addc 28, 28, 20
+ adde 29, 29, 21
+ srdi 22, 29, 0x2
+ sldi 23, 22, 0x2
+ add 23, 23, 22 # (h2 & 3) * 5
+ addc 27, 27, 23 # h0
+ addze 28, 28 # h1
+ andi. 29, 29, 0x3 # h2
+ blr
+SYM_FUNC_END(Carry_reduction)
+
+#
+# poly1305 multiplication
+# h *= r, h %= p
+# d0 = h0 * r0 + h1 * s1
+# d1 = h0 * r1 + h1 * r0 + h2 * s1
+# d2 = h0 * r0
+#
+#
+# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit)
+# - no highbit if final leftover block (highbit = 0)
+#
+SYM_FUNC_START(poly1305_64s)
+ cmpdi 5, 0
+ ble Out_no_poly1305_64
+
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-400(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ # Init poly1305
+ bl Poly1305_init_64
+
+ li 25, 0 # offset to inp and outp
+
+ add 11, 25, 4
+
+ # load h
+ # h0, h1, h2?
+ ld 27, 0(3)
+ ld 28, 8(3)
+ lwz 29, 16(3)
+
+ li 30, 16
+ divdu 31, 5, 30
+
+ mtctr 31
+
+ mr 24, 6 # highbit
+
+Loop_block_64:
+ vxor 9, 9, 9
+
+ ld 20, 0(11)
+ ld 21, 8(11)
+ addi 11, 11, 16
+
+ addc 27, 27, 20
+ adde 28, 28, 21
+ adde 29, 29, 24
+
+ li 22, 0
+ mtvsrdd 32+6, 27, 28 # h0, h1
+ mtvsrdd 32+8, 29, 22 # h2
+
+ bl Poly1305_mult
+
+ bl Carry_reduction
+
+ bdnz Loop_block_64
+
+ std 27, 0(3)
+ std 28, 8(3)
+ stw 29, 16(3)
+
+ li 3, 0
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 400
+ ld 0, 16(1)
+ mtlr 0
+
+ blr
+
+Out_no_poly1305_64:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_64s)
+
+#
+# Input: r3 = h, r4 = s, r5 = mac
+# mac = h + s
+#
+SYM_FUNC_START(poly1305_emit_64)
+ ld 10, 0(3)
+ ld 11, 8(3)
+ ld 12, 16(3)
+
+ # compare modulus
+ # h + 5 + (-p)
+ mr 6, 10
+ mr 7, 11
+ mr 8, 12
+ addic. 6, 6, 5
+ addze 7, 7
+ addze 8, 8
+ srdi 9, 8, 2 # overflow?
+ cmpdi 9, 0
+ beq Skip_h64
+ mr 10, 6
+ mr 11, 7
+ mr 12, 8
+
+Skip_h64:
+ ld 6, 0(4)
+ ld 7, 8(4)
+ addc 10, 10, 6
+ adde 11, 11, 7
+ addze 12, 12
+
+ std 10, 0(5)
+ std 11, 8(5)
+ blr
+SYM_FUNC_END(poly1305_emit_64)
+
+SYM_DATA_START_LOCAL(RMASK)
+.align 5
+rmask:
+.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
+cnum:
+.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
+.long 0x1a, 0x00, 0x1a, 0x00
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
+.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
+.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
+SYM_DATA_END(RMASK)
diff --git a/arch/powerpc/lib/crypto/sha256-spe-asm.S b/arch/powerpc/lib/crypto/sha256-spe-asm.S
new file mode 100644
index 000000000000..cd99d71dae34
--- /dev/null
+++ b/arch/powerpc/lib/crypto/sha256-spe-asm.S
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-256 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP r3 /* pointer to hash values in memory */
+#define rKP r24 /* pointer to round constants */
+#define rWP r4 /* pointer to input data */
+
+#define rH0 r5 /* 8 32 bit hash values in 8 registers */
+#define rH1 r6
+#define rH2 r7
+#define rH3 r8
+#define rH4 r9
+#define rH5 r10
+#define rH6 r11
+#define rH7 r12
+
+#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */
+#define rW1 r15
+#define rW2 r16
+#define rW3 r17
+#define rW4 r18
+#define rW5 r19
+#define rW6 r20
+#define rW7 r21
+
+#define rT0 r22 /* 64 bit temporaries */
+#define rT1 r23
+#define rT2 r0 /* 32 bit temporaries */
+#define rT3 r25
+
+#define CMP_KN_LOOP
+#define CMP_KC_LOOP \
+ cmpwi rT1,0;
+
+#define INITIALIZE \
+ stwu r1,-128(r1); /* create stack frame */ \
+ evstdw r14,8(r1); /* We must save non volatile */ \
+ evstdw r15,16(r1); /* registers. Take the chance */ \
+ evstdw r16,24(r1); /* and save the SPE part too */ \
+ evstdw r17,32(r1); \
+ evstdw r18,40(r1); \
+ evstdw r19,48(r1); \
+ evstdw r20,56(r1); \
+ evstdw r21,64(r1); \
+ evstdw r22,72(r1); \
+ evstdw r23,80(r1); \
+ stw r24,88(r1); /* save normal registers */ \
+ stw r25,92(r1);
+
+
+#define FINALIZE \
+ evldw r14,8(r1); /* restore SPE registers */ \
+ evldw r15,16(r1); \
+ evldw r16,24(r1); \
+ evldw r17,32(r1); \
+ evldw r18,40(r1); \
+ evldw r19,48(r1); \
+ evldw r20,56(r1); \
+ evldw r21,64(r1); \
+ evldw r22,72(r1); \
+ evldw r23,80(r1); \
+ lwz r24,88(r1); /* restore normal registers */ \
+ lwz r25,92(r1); \
+ xor r0,r0,r0; \
+ stw r0,8(r1); /* Delete sensitive data */ \
+ stw r0,16(r1); /* that we might have pushed */ \
+ stw r0,24(r1); /* from other context that runs */ \
+ stw r0,32(r1); /* the same code. Assume that */ \
+ stw r0,40(r1); /* the lower part of the GPRs */ \
+ stw r0,48(r1); /* was already overwritten on */ \
+ stw r0,56(r1); /* the way down to here */ \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ addi r1,r1,128; /* cleanup stack frame */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#else
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */ \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#endif
+
+#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
+ LOAD_DATA(w, off) /* 1: W */ \
+ rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \
+ rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \
+ rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \
+ and rT3,e,f; /* 1: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \
+ andc rT1,g,e; /* 1: ch' = ~e and g */ \
+ lwz rT2,off(rKP); /* 1: K */ \
+ xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \
+ add h,h,rT0; /* 1: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 1: temp1' = ch + w */ \
+ rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \
+ add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + K */ \
+ rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \
+ evmergelo w,w,w; /* shift W */ \
+ or rT2,a,b; /* 1: maj = a or b */ \
+ and rT1,a,b; /* 1: maj' = a and b */ \
+ and rT2,rT2,c; /* 1: maj = maj and c */ \
+ LOAD_DATA(w, off+4) /* 2: W */ \
+ or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+ add h,h,rT3; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ lwz rT2,off+4(rKP); /* 2: K */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 2: temp1' = ch + w */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ add g,g,rT2; /* 2: temp1 = temp1 + K */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
+#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
+ rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \
+ evmergelohi rT0,w0,w1; /* w[-15] */ \
+ rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \
+ evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \
+ rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \
+ evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \
+ add h,h,rT2; /* 1: temp1 = h + S1 */ \
+ evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \
+ and rT2,e,f; /* 1: ch = e and f */ \
+ evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \
+ andc rT3,g,e; /* 1: ch' = ~e and g */ \
+ evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \
+ xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \
+ evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + ch */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \
+ evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \
+ rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evldw rT1,off(rKP); /* k */ \
+ rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \
+ evaddw w0,w0,rT0; /* w = w + s1 */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evmergelohi rT0,w4,w5; /* w[-7] */ \
+ and rT3,a,b; /* 1: maj = a and b */ \
+ evaddw w0,w0,rT0; /* w = w + w[-7] */ \
+ CMP_K##k##_LOOP \
+ add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \
+ evaddw rT1,rT1,w0; /* wk = w + k */ \
+ xor rT3,a,b; /* 1: maj = a xor b */ \
+ evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \
+ and rT3,rT3,c; /* 1: maj = maj and c */ \
+ add h,h,rT0; /* 1: temp1 = temp1 + wk */ \
+ add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \
+ add g,g,rT1; /* 2: temp1 = temp1 + wk */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add h,h,rT2; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+ rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + ch */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
+_GLOBAL(ppc_spe_sha256_transform)
+ INITIALIZE
+
+ mtctr r5
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ lwz rH2,8(rHP)
+ lwz rH3,12(rHP)
+ lwz rH4,16(rHP)
+ lwz rH5,20(rHP)
+ lwz rH6,24(rHP)
+ lwz rH7,28(rHP)
+
+ppc_spe_sha256_main:
+ lis rKP,PPC_SPE_SHA256_K@ha
+ addi rKP,rKP,PPC_SPE_SHA256_K@l
+
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
+ppc_spe_sha256_16_rounds:
+ addi rKP,rKP,64
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW0, rW1, rW4, rW5, rW7, N, 0)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW1, rW2, rW5, rW6, rW0, N, 8)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW2, rW3, rW6, rW7, rW1, N, 16)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW3, rW4, rW7, rW0, rW2, N, 24)
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW4, rW5, rW0, rW1, rW3, N, 32)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW5, rW6, rW1, rW2, rW4, N, 40)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW6, rW7, rW2, rW3, rW5, N, 48)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW7, rW0, rW3, rW4, rW6, C, 56)
+ bt gt,ppc_spe_sha256_16_rounds
+
+ lwz rW0,0(rHP)
+ NEXT_BLOCK
+ lwz rW1,4(rHP)
+ lwz rW2,8(rHP)
+ lwz rW3,12(rHP)
+ lwz rW4,16(rHP)
+ lwz rW5,20(rHP)
+ lwz rW6,24(rHP)
+ lwz rW7,28(rHP)
+
+ add rH0,rH0,rW0
+ stw rH0,0(rHP)
+ add rH1,rH1,rW1
+ stw rH1,4(rHP)
+ add rH2,rH2,rW2
+ stw rH2,8(rHP)
+ add rH3,rH3,rW3
+ stw rH3,12(rHP)
+ add rH4,rH4,rW4
+ stw rH4,16(rHP)
+ add rH5,rH5,rW5
+ stw rH5,20(rHP)
+ add rH6,rH6,rW6
+ stw rH6,24(rHP)
+ add rH7,rH7,rW7
+ stw rH7,28(rHP)
+
+ bdnz ppc_spe_sha256_main
+
+ FINALIZE
+ blr
+
+.data
+.align 5
+PPC_SPE_SHA256_K:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/arch/powerpc/lib/crypto/sha256.c b/arch/powerpc/lib/crypto/sha256.c
new file mode 100644
index 000000000000..6b0f079587eb
--- /dev/null
+++ b/arch/powerpc/lib/crypto/sha256.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 Secure Hash Algorithm, SPE optimized
+ *
+ * Based on generic implementation. The assembler module takes care
+ * about the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+ /* We just start SPE operations and will save SPE registers later. */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks)
+{
+ do {
+ /* cut input data into smaller blocks */
+ u32 unit = min_t(size_t, nblocks,
+ MAX_BYTES / SHA256_BLOCK_SIZE);
+
+ spe_begin();
+ ppc_spe_sha256_transform(state, data, unit);
+ spe_end();
+
+ data += unit * SHA256_BLOCK_SIZE;
+ nblocks -= unit;
+ } while (nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized");
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index d491da8d1838..54340912398f 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -45,7 +45,7 @@ int exit_vmx_usercopy(void)
* set and we are preemptible. The hack here is to schedule a
* decrementer to fire here and reschedule for us if necessary.
*/
- if (IS_ENABLED(CONFIG_PREEMPT) && need_resched())
+ if (need_irq_preemption() && need_resched())
set_dec(1);
return 0;
}