Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu: "Here is the crypto update for 4.12: API: - Add batch registration for acomp/scomp - Change acomp testing to non-unique compressed result - Extend algorithm name limit to 128 bytes - Require setkey before accept(2) in algif_aead Algorithms: - Add support for deflate rfc1950 (zlib) Drivers: - Add accelerated crct10dif for powerpc - Add crc32 in stm32 - Add sha384/sha512 in ccp - Add 3des/gcm(aes) for v5 devices in ccp - Add Queue Interface (QI) backend support in caam - Add new Exynos RNG driver - Add ThunderX ZIP driver - Add driver for hardware random generator on MT7623 SoC" * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (101 commits) crypto: stm32 - Fix OF module alias information crypto: algif_aead - Require setkey before accept(2) crypto: scomp - add support for deflate rfc1950 (zlib) crypto: scomp - allow registration of multiple scomps crypto: ccp - Change ISR handler method for a v5 CCP crypto: ccp - Change ISR handler method for a v3 CCP crypto: crypto4xx - rename ce_ring_contol to ce_ring_control crypto: testmgr - Allow ecb(cipher_null) in FIPS mode Revert "crypto: arm64/sha - Add constant operand modifier to ASM_EXPORT" crypto: ccp - Disable interrupts early on unload crypto: ccp - Use only the relevant interrupt bits hwrng: mtk - Add driver for hardware random generator on MT7623 SoC dt-bindings: hwrng: Add Mediatek hardware random generator bindings crypto: crct10dif-vpmsum - Fix missing preempt_disable() crypto: testmgr - replace compression known answer test crypto: acomp - allow registration of multiple acomps hwrng: n2 - Use devm_kcalloc() in n2rng_probe() crypto: chcr - Fix error handling related to 'chcr_alloc_shash' padata: get_next is never NULL crypto: exynos - Add new Exynos RNG driver ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-03 01:53:46 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-03 01:53:46 +0300
commit: 5a0387a8a8efb90ae7fea1e2e5c62de3efa74691 (patch)
tree: 9e5bbbafe7fea01c843d86c7c3d40f29f962c474 /arch
parent: 204f144c9fcac355843412b6ba1150086488a208 (diff)
parent: 929562b144783b9212625305eadcbbd800809643 (diff)
download: linux-5a0387a8a8efb90ae7fea1e2e5c62de3efa74691.tar.xz
20 files changed, 1952 insertions, 752 deletions
diff --git a/arch/arm/boot/dts/stm32746g-eval.dts b/arch/arm/boot/dts/stm32746g-eval.dts
index aa03fac1ec55..0dc18a0f0940 100644
--- a/arch/arm/boot/dts/stm32746g-eval.dts
+++ b/arch/arm/boot/dts/stm32746g-eval.dts
@@ -89,6 +89,10 @@
 	clock-frequency = <25000000>;
 };
 
+&crc {
+	status = "okay";
+};
+
 &usart1 {
 	pinctrl-0 = <&usart1_pins_a>;
 	pinctrl-names = "default";
diff --git a/arch/arm/boot/dts/stm32f746.dtsi b/arch/arm/boot/dts/stm32f746.dtsi
index f321ffe87144..755fb923c07b 100644
--- a/arch/arm/boot/dts/stm32f746.dtsi
+++ b/arch/arm/boot/dts/stm32f746.dtsi
@@ -289,6 +289,13 @@
 			};
 		};
 
+		crc: crc@40023000 {
+			compatible = "st,stm32f7-crc";
+			reg = <0x40023000 0x400>;
+			clocks = <&rcc 0 12>;
+			status = "disabled";
+		};
+
 		rcc: rcc@40023800 {
 			#clock-cells = <2>;
 			compatible = "st,stm32f42xx-rcc", "st,stm32-rcc";
diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig
index a9d8e3c9b487..03437f8f9ad1 100644
--- a/arch/arm/configs/stm32_defconfig
+++ b/arch/arm/configs/stm32_defconfig
@@ -75,5 +75,7 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_BUGVERBOSE is not set
 # CONFIG_FTRACE is not set
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_DEV_STM32=y
 CONFIG_CRC_ITU_T=y
 CONFIG_CRC7=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index a8fce93137fb..b9adedcc5b2e 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -73,7 +73,7 @@ config CRYPTO_AES_ARM_BS
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_SIMD
-	select CRYPTO_AES_ARM
+	select CRYPTO_AES
 	help
 	  Use a faster and more secure NEON based implementation of AES in CBC,
 	  CTR and XTS modes
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index 2920b96dbd36..c76377961444 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -42,9 +42,6 @@ asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
 asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]);
 
-asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, const u8 in[],
-				  u8 out[]);
-
 struct aesbs_ctx {
 	int	rounds;
 	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE);
@@ -52,12 +49,12 @@ struct aesbs_ctx {
 
 struct aesbs_cbc_ctx {
 	struct aesbs_ctx	key;
-	u32			enc[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*enc_tfm;
 };
 
 struct aesbs_xts_ctx {
 	struct aesbs_ctx	key;
-	u32			twkey[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*tweak_tfm;
 };
 
 static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
@@ -132,20 +129,18 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 
 	ctx->key.rounds = 6 + key_len / 4;
 
-	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
-
 	kernel_neon_begin();
 	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
 	kernel_neon_end();
 
-	return 0;
+	return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len);
 }
 
 static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
 {
 	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	__aes_arm_encrypt(ctx->enc, ctx->key.rounds, src, dst);
+	crypto_cipher_encrypt_one(ctx->enc_tfm, dst, src);
 }
 
 static int cbc_encrypt(struct skcipher_request *req)
@@ -181,6 +176,23 @@ static int cbc_decrypt(struct skcipher_request *req)
 	return err;
 }
 
+static int cbc_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->enc_tfm))
+		return PTR_ERR(ctx->enc_tfm);
+	return 0;
+}
+
+static void cbc_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->enc_tfm);
+}
+
 static int ctr_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -228,7 +240,6 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			    unsigned int key_len)
 {
 	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct crypto_aes_ctx rk;
 	int err;
 
 	err = xts_verify_key(tfm, in_key, key_len);
@@ -236,15 +247,30 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 		return err;
 
 	key_len /= 2;
-	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	err = crypto_cipher_setkey(ctx->tweak_tfm, in_key + key_len, key_len);
 	if (err)
 		return err;
 
-	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
-
 	return aesbs_setkey(tfm, in_key, key_len);
 }
 
+static int xts_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->tweak_tfm))
+		return PTR_ERR(ctx->tweak_tfm);
+	return 0;
+}
+
+static void xts_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->tweak_tfm);
+}
+
 static int __xts_crypt(struct skcipher_request *req,
 		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]))
@@ -256,7 +282,7 @@ static int __xts_crypt(struct skcipher_request *req,
 
 	err = skcipher_walk_virt(&walk, req, true);
 
-	__aes_arm_encrypt(ctx->twkey, ctx->key.rounds, walk.iv, walk.iv);
+	crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv);
 
 	kernel_neon_begin();
 	while (walk.nbytes >= AES_BLOCK_SIZE) {
@@ -309,6 +335,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= cbc_init,
+	.base.cra_exit		= cbc_exit,
 
 	.min_keysize		= AES_MIN_KEY_SIZE,
 	.max_keysize		= AES_MAX_KEY_SIZE,
@@ -342,6 +370,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= xts_init,
+	.base.cra_exit		= xts_exit,
 
 	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
 	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
@@ -402,5 +432,5 @@ unregister_simds:
 	return err;
 }
 
-module_init(aes_init);
+late_initcall(aes_init);
 module_exit(aes_exit);
diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
index 5d995f7724af..620495a43363 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
@@ -380,7 +380,7 @@
 			#size-cells = <2>;
 			ranges = <0x0 0x0 0x0 0xc8834000 0x0 0x2000>;
 
-			rng {
+			hwrng: rng {
 				compatible = "amlogic,meson-rng";
 				reg = <0x0 0x0 0x0 0x4>;
 			};
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
index 04b3324bc132..a375cb21cc8b 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
@@ -524,3 +524,8 @@
 &vpu {
 	compatible = "amlogic,meson-gxbb-vpu", "amlogic,meson-gx-vpu";
 };
+
+&hwrng {
+	clocks = <&clkc CLKID_RNG0>;
+	clock-names = "core";
+};
diff --git a/arch/metag/kernel/stacktrace.c b/arch/metag/kernel/stacktrace.c
index 91ffc4b75c33..09d67b7f51ca 100644
--- a/arch/metag/kernel/stacktrace.c
+++ b/arch/metag/kernel/stacktrace.c
@@ -31,8 +31,6 @@ static void tbi_boing_init(void)
 }
 #endif
 
-#define ALIGN_DOWN(addr, size)  ((addr)&(~((size)-1)))
-
 /*
  * Unwind the current stack frame and store the new register values in the
  * structure passed as argument. Unwinding is equivalent to a function return,
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 87f40454bad3..67eca3af9fc7 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
 obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
+obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -17,3 +19,4 @@ sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
 sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
 crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
+crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c
new file mode 100644
index 000000000000..0153a9c6f4af
--- /dev/null
+++ b/arch/powerpc/crypto/crc-vpmsum_test.c
@@ -0,0 +1,137 @@
+/*
+ * CRC vpmsum tester
+ * Copyright 2017 Daniel Axtens, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+static unsigned long iterations = 10000;
+
+#define MAX_CRC_LENGTH 65535
+
+
+static int __init crc_test_init(void)
+{
+	u16 crc16 = 0, verify16 = 0;
+	u32 crc32 = 0, verify32 = 0;
+	__le32 verify32le = 0;
+	unsigned char *data;
+	unsigned long i;
+	int ret;
+
+	struct crypto_shash *crct10dif_tfm;
+	struct crypto_shash *crc32c_tfm;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
+
+	if (IS_ERR(crct10dif_tfm)) {
+		pr_err("Error allocating crc-t10dif\n");
+		goto free_buf;
+	}
+
+	crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
+
+	if (IS_ERR(crc32c_tfm)) {
+		pr_err("Error allocating crc32c\n");
+		goto free_16;
+	}
+
+	do {
+		SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
+		SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);
+
+		crct10dif_shash->tfm = crct10dif_tfm;
+		ret = crypto_shash_init(crct10dif_shash);
+
+		if (ret) {
+			pr_err("Error initing crc-t10dif\n");
+			goto free_32;
+		}
+
+
+		crc32c_shash->tfm = crc32c_tfm;
+		ret = crypto_shash_init(crc32c_shash);
+
+		if (ret) {
+			pr_err("Error initing crc32c\n");
+			goto free_32;
+		}
+
+		pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
+		for (i=0; i<iterations; i++) {
+			size_t len, offset;
+
+			get_random_bytes(data, MAX_CRC_LENGTH);
+			get_random_bytes(&len, sizeof(len));
+			get_random_bytes(&offset, sizeof(offset));
+
+			len %= MAX_CRC_LENGTH;
+			offset &= 15;
+			if (len <= offset)
+				continue;
+			len -= offset;
+
+			crypto_shash_update(crct10dif_shash, data+offset, len);
+			crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
+			verify16 = crc_t10dif_generic(verify16, data+offset, len);
+
+
+			if (crc16 != verify16) {
+				pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %lu)\n",
+				       crc16, verify16, len);
+				break;
+			}
+
+			crypto_shash_update(crc32c_shash, data+offset, len);
+			crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
+			verify32 = le32_to_cpu(verify32le);
+		        verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
+			if (crc32 != (u32)verify32le) {
+				pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n",
+				       crc32, verify32, len);
+				break;
+			}
+		}
+		pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);
+	} while (0);
+
+free_32:
+	crypto_free_shash(crc32c_tfm);
+
+free_16:
+	crypto_free_shash(crct10dif_tfm);
+
+free_buf:
+	kfree(data);
+
+	return 0;
+}
+
+static void __exit crc_test_exit(void) {}
+
+module_init(crc_test_init);
+module_exit(crc_test_exit);
+module_param(iterations, long, 0400);
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
new file mode 100644
index 000000000000..aadb59c96a27
--- /dev/null
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -0,0 +1,755 @@
+/*
+ * Core of the accelerated CRC algorithm.
+ * In your file, define the constants and CRC_FUNCTION_NAME
+ * Then include this file.
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+*/
+
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+#define MAX_SIZE	32768
+
+	.text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16		r25
+#define off32		r26
+#define off48		r27
+#define off64		r28
+#define off80		r29
+#define off96		r30
+#define off112		r31
+
+#define const1		v24
+#define const2		v25
+
+#define byteswap	v26
+#define	mask_32bit	v27
+#define	mask_64bit	v28
+#define zeroes		v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm	A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC_FUNCTION_NAME)
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	li	off16,16
+	li	off32,32
+	li	off48,48
+	li	off64,64
+	li	off80,80
+	li	off96,96
+	li	off112,112
+	li	r0,0
+
+	/* Enough room for saving 10 non volatile VMX registers */
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+
+	mr	r10,r3
+
+	vxor	zeroes,zeroes,zeroes
+	vspltisw v0,-1
+
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
+
+	/* Get the initial value into v8 */
+	vxor	v8,v8,v8
+	MTVRD(v8, R3)
+#ifdef REFLECT
+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+#else
+	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+	addis	r3,r2,.byteswap_constant@toc@ha
+	addi	r3,r3,.byteswap_constant@toc@l
+
+	lvx	byteswap,0,r3
+	addi	r3,r3,16
+#endif
+
+	cmpdi	r5,256
+	blt	.Lshort
+
+	rldicr	r6,r5,0,56
+
+	/* Checksum in blocks of MAX_SIZE */
+1:	lis	r7,MAX_SIZE@h
+	ori	r7,r7,MAX_SIZE@l
+	mr	r9,r7
+	cmpd	r6,r7
+	bgt	2f
+	mr	r7,r6
+2:	subf	r6,r7,r6
+
+	/* our main loop does 128 bytes at a time */
+	srdi	r7,r7,7
+
+	/*
+	 * Work out the offset into the constants table to start at. Each
+	 * constant is 16 bytes, and it is used against 128 bytes of input
+	 * data - 128 / 16 = 8
+	 */
+	sldi	r8,r7,4
+	srdi	r9,r9,3
+	subf	r8,r8,r9
+
+	/* We reduce our final 128 bytes in a separate step */
+	addi	r7,r7,-1
+	mtctr	r7
+
+	addis	r3,r2,.constants@toc@ha
+	addi	r3,r3,.constants@toc@l
+
+	/* Find the start of our constants */
+	add	r3,r3,r8
+
+	/* zero v0-v7 which will contain our checksums */
+	vxor	v0,v0,v0
+	vxor	v1,v1,v1
+	vxor	v2,v2,v2
+	vxor	v3,v3,v3
+	vxor	v4,v4,v4
+	vxor	v5,v5,v5
+	vxor	v6,v6,v6
+	vxor	v7,v7,v7
+
+	lvx	const1,0,r3
+
+	/*
+	 * If we are looping back to consume more data we use the values
+	 * already in v16-v23.
+	 */
+	cmpdi	r0,1
+	beq	2f
+
+	/* First warm up pass */
+	lvx	v16,0,r4
+	lvx	v17,off16,r4
+	VPERM(v16,v16,v16,byteswap)
+	VPERM(v17,v17,v17,byteswap)
+	lvx	v18,off32,r4
+	lvx	v19,off48,r4
+	VPERM(v18,v18,v18,byteswap)
+	VPERM(v19,v19,v19,byteswap)
+	lvx	v20,off64,r4
+	lvx	v21,off80,r4
+	VPERM(v20,v20,v20,byteswap)
+	VPERM(v21,v21,v21,byteswap)
+	lvx	v22,off96,r4
+	lvx	v23,off112,r4
+	VPERM(v22,v22,v22,byteswap)
+	VPERM(v23,v23,v23,byteswap)
+	addi	r4,r4,8*16
+
+	/* xor in initial value */
+	vxor	v16,v16,v8
+
+2:	bdz	.Lfirst_warm_up_done
+
+	addi	r3,r3,16
+	lvx	const2,0,r3
+
+	/* Second warm up pass */
+	VPMSUMD(v8,v16,const1)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v9,v17,const1)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v10,v18,const1)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v11,v19,const1)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdz	.Lfirst_cool_down
+
+	/*
+	 * main loop. We modulo schedule it such that it takes three iterations
+	 * to complete - first iteration load, second iteration vpmsum, third
+	 * iteration xor.
+	 */
+	.balign	16
+4:	lvx	const1,0,r3
+	addi	r3,r3,16
+	ori	r2,r2,0
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const2)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const2)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const2)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const2)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	lvx	const2,0,r3
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdnz	4b
+
+.Lfirst_cool_down:
+	/* First cool down pass */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const1)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const1)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const1)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const1)
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	ori	r2,r2,0
+
+.Lsecond_cool_down:
+	/* Second cool down pass */
+	vxor	v0,v0,v8
+	vxor	v1,v1,v9
+	vxor	v2,v2,v10
+	vxor	v3,v3,v11
+	vxor	v4,v4,v12
+	vxor	v5,v5,v13
+	vxor	v6,v6,v14
+	vxor	v7,v7,v15
+
+#ifdef REFLECT
+	/*
+	 * vpmsumd produces a 96 bit result in the least significant bits
+	 * of the register. Since we are bit reflected we have to shift it
+	 * left 32 bits so it occupies the least significant bits in the
+	 * bit reflected domain.
+	 */
+	vsldoi	v0,v0,zeroes,4
+	vsldoi	v1,v1,zeroes,4
+	vsldoi	v2,v2,zeroes,4
+	vsldoi	v3,v3,zeroes,4
+	vsldoi	v4,v4,zeroes,4
+	vsldoi	v5,v5,zeroes,4
+	vsldoi	v6,v6,zeroes,4
+	vsldoi	v7,v7,zeroes,4
+#endif
+
+	/* xor with last 1024 bits */
+	lvx	v8,0,r4
+	lvx	v9,off16,r4
+	VPERM(v8,v8,v8,byteswap)
+	VPERM(v9,v9,v9,byteswap)
+	lvx	v10,off32,r4
+	lvx	v11,off48,r4
+	VPERM(v10,v10,v10,byteswap)
+	VPERM(v11,v11,v11,byteswap)
+	lvx	v12,off64,r4
+	lvx	v13,off80,r4
+	VPERM(v12,v12,v12,byteswap)
+	VPERM(v13,v13,v13,byteswap)
+	lvx	v14,off96,r4
+	lvx	v15,off112,r4
+	VPERM(v14,v14,v14,byteswap)
+	VPERM(v15,v15,v15,byteswap)
+
+	addi	r4,r4,8*16
+
+	vxor	v16,v0,v8
+	vxor	v17,v1,v9
+	vxor	v18,v2,v10
+	vxor	v19,v3,v11
+	vxor	v20,v4,v12
+	vxor	v21,v5,v13
+	vxor	v22,v6,v14
+	vxor	v23,v7,v15
+
+	li	r0,1
+	cmpdi	r6,0
+	addi	r6,r6,128
+	bne	1b
+
+	/* Work out how many bytes we have left */
+	andi.	r5,r5,127
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,128
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks are in the tail */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	/*
+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
+	 * 32 bits to include the trailing 32 bits of zeros
+	 */
+	lvx	v0,0,r3
+	lvx	v1,off16,r3
+	lvx	v2,off32,r3
+	lvx	v3,off48,r3
+	lvx	v4,off64,r3
+	lvx	v5,off80,r3
+	lvx	v6,off96,r3
+	lvx	v7,off112,r3
+	addi	r3,r3,8*16
+
+	VPMSUMW(v0,v16,v0)
+	VPMSUMW(v1,v17,v1)
+	VPMSUMW(v2,v18,v2)
+	VPMSUMW(v3,v19,v3)
+	VPMSUMW(v4,v20,v4)
+	VPMSUMW(v5,v21,v5)
+	VPMSUMW(v6,v22,v6)
+	VPMSUMW(v7,v23,v7)
+
+	/* Now reduce the tail (0 - 112 bytes) */
+	cmpdi	r7,0
+	beq	1f
+
+	lvx	v16,0,r4
+	lvx	v17,0,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off32,r4
+	lvx	v17,off32,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off64,r4
+	lvx	v17,off64,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off96,r4
+	lvx	v17,off96,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+
+	/* Now xor all the parallel chunks together */
+1:	vxor	v0,v0,v1
+	vxor	v2,v2,v3
+	vxor	v4,v4,v5
+	vxor	v6,v6,v7
+
+	vxor	v0,v0,v2
+	vxor	v4,v4,v6
+
+	vxor	v0,v0,v4
+
+.Lbarrett_reduction:
+	/* Barrett constants */
+	addis	r3,r2,.barrett_constants@toc@ha
+	addi	r3,r3,.barrett_constants@toc@l
+
+	lvx	const1,0,r3
+	lvx	const2,off16,r3
+
+	vsldoi	v1,v0,v0,8
+	vxor	v0,v0,v1		/* xor two 64 bit results together */
+
+#ifdef REFLECT
+	/* shift left one bit */
+	vspltisb v1,1
+	vsl	v0,v0,v1
+#endif
+
+	vand	v0,v0,mask_64bit
+#ifndef REFLECT
+	/*
+	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v1,v0,const1)	/* ma */
+	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
+#else
+	/*
+	 * The reflected version of Barrett reduction. Instead of bit
+	 * reflecting our data (which is expensive to do), we bit reflect our
+	 * constants and our algorithm, which means the intermediate data in
+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
+	 * the algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
+	VPMSUMD(v1,v1,const1)		/* ma */
+	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
+	VPMSUMD(v1,v1,const2)		/* qn */
+	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
+	 * the high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of */
+#endif
+
+	/* Get it into r3 */
+	MFVRD(R3, v0)
+
+.Lout:
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
+
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	ld	r26,-48(r1)
+	ld	r25,-56(r1)
+
+	blr
+
+.Lfirst_warm_up_done:
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	VPMSUMD(v8,v16,const1)
+	VPMSUMD(v9,v17,const1)
+	VPMSUMD(v10,v18,const1)
+	VPMSUMD(v11,v19,const1)
+	VPMSUMD(v12,v20,const1)
+	VPMSUMD(v13,v21,const1)
+	VPMSUMD(v14,v22,const1)
+	VPMSUMD(v15,v23,const1)
+
+	b	.Lsecond_cool_down
+
+.Lshort:
+	cmpdi	r5,0
+	beq	.Lzero
+
+	addis	r3,r2,.short_constants@toc@ha
+	addi	r3,r3,.short_constants@toc@l
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,256
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks? */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	vxor	v19,v19,v19
+	vxor	v20,v20,v20
+
+	lvx	v0,0,r4
+	lvx	v16,0,r3
+	VPERM(v0,v0,v16,byteswap)
+	vxor	v0,v0,v8	/* xor in initial value */
+	VPMSUMW(v0,v0,v16)
+	bdz	.Lv0
+
+	lvx	v1,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v1,v1,v17,byteswap)
+	VPMSUMW(v1,v1,v17)
+	bdz	.Lv1
+
+	lvx	v2,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v2,v2,v16,byteswap)
+	VPMSUMW(v2,v2,v16)
+	bdz	.Lv2
+
+	lvx	v3,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v3,v3,v17,byteswap)
+	VPMSUMW(v3,v3,v17)
+	bdz	.Lv3
+
+	lvx	v4,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v4,v4,v16,byteswap)
+	VPMSUMW(v4,v4,v16)
+	bdz	.Lv4
+
+	lvx	v5,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v5,v5,v17,byteswap)
+	VPMSUMW(v5,v5,v17)
+	bdz	.Lv5
+
+	lvx	v6,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v6,v6,v16,byteswap)
+	VPMSUMW(v6,v6,v16)
+	bdz	.Lv6
+
+	lvx	v7,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v7,v7,v17,byteswap)
+	VPMSUMW(v7,v7,v17)
+	bdz	.Lv7
+
+	addi	r3,r3,128
+	addi	r4,r4,128
+
+	lvx	v8,0,r4
+	lvx	v16,0,r3
+	VPERM(v8,v8,v16,byteswap)
+	VPMSUMW(v8,v8,v16)
+	bdz	.Lv8
+
+	lvx	v9,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v9,v9,v17,byteswap)
+	VPMSUMW(v9,v9,v17)
+	bdz	.Lv9
+
+	lvx	v10,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v10,v10,v16,byteswap)
+	VPMSUMW(v10,v10,v16)
+	bdz	.Lv10
+
+	lvx	v11,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v11,v11,v17,byteswap)
+	VPMSUMW(v11,v11,v17)
+	bdz	.Lv11
+
+	lvx	v12,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v12,v12,v16,byteswap)
+	VPMSUMW(v12,v12,v16)
+	bdz	.Lv12
+
+	lvx	v13,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v13,v13,v17,byteswap)
+	VPMSUMW(v13,v13,v17)
+	bdz	.Lv13
+
+	lvx	v14,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v14,v14,v16,byteswap)
+	VPMSUMW(v14,v14,v16)
+	bdz	.Lv14
+
+	lvx	v15,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v15,v15,v17,byteswap)
+	VPMSUMW(v15,v15,v17)
+
+.Lv15:	vxor	v19,v19,v15
+.Lv14:	vxor	v20,v20,v14
+.Lv13:	vxor	v19,v19,v13
+.Lv12:	vxor	v20,v20,v12
+.Lv11:	vxor	v19,v19,v11
+.Lv10:	vxor	v20,v20,v10
+.Lv9:	vxor	v19,v19,v9
+.Lv8:	vxor	v20,v20,v8
+.Lv7:	vxor	v19,v19,v7
+.Lv6:	vxor	v20,v20,v6
+.Lv5:	vxor	v19,v19,v5
+.Lv4:	vxor	v20,v20,v4
+.Lv3:	vxor	v19,v19,v3
+.Lv2:	vxor	v20,v20,v2
+.Lv1:	vxor	v19,v19,v1
+.Lv0:	vxor	v20,v20,v0
+
+	vxor	v0,v19,v20
+
+	b	.Lbarrett_reduction
+
+.Lzero:
+	mr	r3,r10
+	b	.Lout
+
+FUNC_END(CRC_FUNCTION_NAME)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
index dc640b212299..d2bea48051a0 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S
+++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
@@ -1,20 +1,5 @@
 /*
- * Calculate the checksum of data that is 16 byte aligned and a multiple of
- * 16 bytes.
- *
- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
- * chunks in order to mask the latency of the vpmsum instructions. If we
- * have more than 32 kB of data to checksum we repeat this step multiple
- * times, passing in the previous 1024 bits.
- *
- * The next step is to reduce the 1024 bits to 64 bits. This step adds
- * 32 bits of 0s to the end - this matches what a CRC does. We just
- * calculate constants that land the data in this 32 bits.
- *
- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
- * for n = CRC using POWER8 instructions. We use x = 32.
- *
- * http://en.wikipedia.org/wiki/Barrett_reduction
+ * Calculate a crc32c with vpmsum acceleration
  *
  * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
  *
@@ -23,9 +8,6 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-#include <asm/ppc_asm.h>
-#include <asm/ppc-opcode.h>
-
 	.section	.rodata
 .balign 16
 
@@ -33,7 +15,6 @@
 	/* byte reverse permute constant */
 	.octa 0x0F0E0D0C0B0A09080706050403020100
 
-#define MAX_SIZE	32768
 .constants:
 
 	/* Reduce 262144 kbits to 1024 bits */
@@ -860,694 +841,6 @@
 	/* 33 bit reflected Barrett constant n */
 	.octa 0x00000000000000000000000105ec76f1
 
-	.text
-
-#if defined(__BIG_ENDIAN__)
-#define BYTESWAP_DATA
-#else
-#undef BYTESWAP_DATA
-#endif
-
-#define off16		r25
-#define off32		r26
-#define off48		r27
-#define off64		r28
-#define off80		r29
-#define off96		r30
-#define off112		r31
-
-#define const1		v24
-#define const2		v25
-
-#define byteswap	v26
-#define	mask_32bit	v27
-#define	mask_64bit	v28
-#define zeroes		v29
-
-#ifdef BYTESWAP_DATA
-#define VPERM(A, B, C, D) vperm	A, B, C, D
-#else
-#define VPERM(A, B, C, D)
-#endif
-
-/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
-FUNC_START(__crc32c_vpmsum)
-	std	r31,-8(r1)
-	std	r30,-16(r1)
-	std	r29,-24(r1)
-	std	r28,-32(r1)
-	std	r27,-40(r1)
-	std	r26,-48(r1)
-	std	r25,-56(r1)
-
-	li	off16,16
-	li	off32,32
-	li	off48,48
-	li	off64,64
-	li	off80,80
-	li	off96,96
-	li	off112,112
-	li	r0,0
-
-	/* Enough room for saving 10 non volatile VMX registers */
-	subi	r6,r1,56+10*16
-	subi	r7,r1,56+2*16
-
-	stvx	v20,0,r6
-	stvx	v21,off16,r6
-	stvx	v22,off32,r6
-	stvx	v23,off48,r6
-	stvx	v24,off64,r6
-	stvx	v25,off80,r6
-	stvx	v26,off96,r6
-	stvx	v27,off112,r6
-	stvx	v28,0,r7
-	stvx	v29,off16,r7
-
-	mr	r10,r3
-
-	vxor	zeroes,zeroes,zeroes
-	vspltisw v0,-1
-
-	vsldoi	mask_32bit,zeroes,v0,4
-	vsldoi	mask_64bit,zeroes,v0,8
-
-	/* Get the initial value into v8 */
-	vxor	v8,v8,v8
-	MTVRD(v8, R3)
-	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
-
-#ifdef BYTESWAP_DATA
-	addis	r3,r2,.byteswap_constant@toc@ha
-	addi	r3,r3,.byteswap_constant@toc@l
-
-	lvx	byteswap,0,r3
-	addi	r3,r3,16
-#endif
-
-	cmpdi	r5,256
-	blt	.Lshort
-
-	rldicr	r6,r5,0,56
-
-	/* Checksum in blocks of MAX_SIZE */
-1:	lis	r7,MAX_SIZE@h
-	ori	r7,r7,MAX_SIZE@l
-	mr	r9,r7
-	cmpd	r6,r7
-	bgt	2f
-	mr	r7,r6
-2:	subf	r6,r7,r6
-
-	/* our main loop does 128 bytes at a time */
-	srdi	r7,r7,7
-
-	/*
-	 * Work out the offset into the constants table to start at. Each
-	 * constant is 16 bytes, and it is used against 128 bytes of input
-	 * data - 128 / 16 = 8
-	 */
-	sldi	r8,r7,4
-	srdi	r9,r9,3
-	subf	r8,r8,r9
-
-	/* We reduce our final 128 bytes in a separate step */
-	addi	r7,r7,-1
-	mtctr	r7
-
-	addis	r3,r2,.constants@toc@ha
-	addi	r3,r3,.constants@toc@l
-
-	/* Find the start of our constants */
-	add	r3,r3,r8
-
-	/* zero v0-v7 which will contain our checksums */
-	vxor	v0,v0,v0
-	vxor	v1,v1,v1
-	vxor	v2,v2,v2
-	vxor	v3,v3,v3
-	vxor	v4,v4,v4
-	vxor	v5,v5,v5
-	vxor	v6,v6,v6
-	vxor	v7,v7,v7
-
-	lvx	const1,0,r3
-
-	/*
-	 * If we are looping back to consume more data we use the values
-	 * already in v16-v23.
-	 */
-	cmpdi	r0,1
-	beq	2f
-
-	/* First warm up pass */
-	lvx	v16,0,r4
-	lvx	v17,off16,r4
-	VPERM(v16,v16,v16,byteswap)
-	VPERM(v17,v17,v17,byteswap)
-	lvx	v18,off32,r4
-	lvx	v19,off48,r4
-	VPERM(v18,v18,v18,byteswap)
-	VPERM(v19,v19,v19,byteswap)
-	lvx	v20,off64,r4
-	lvx	v21,off80,r4
-	VPERM(v20,v20,v20,byteswap)
-	VPERM(v21,v21,v21,byteswap)
-	lvx	v22,off96,r4
-	lvx	v23,off112,r4
-	VPERM(v22,v22,v22,byteswap)
-	VPERM(v23,v23,v23,byteswap)
-	addi	r4,r4,8*16
-
-	/* xor in initial value */
-	vxor	v16,v16,v8
-
-2:	bdz	.Lfirst_warm_up_done
-
-	addi	r3,r3,16
-	lvx	const2,0,r3
-
-	/* Second warm up pass */
-	VPMSUMD(v8,v16,const1)
-	lvx	v16,0,r4
-	VPERM(v16,v16,v16,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v9,v17,const1)
-	lvx	v17,off16,r4
-	VPERM(v17,v17,v17,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v10,v18,const1)
-	lvx	v18,off32,r4
-	VPERM(v18,v18,v18,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v11,v19,const1)
-	lvx	v19,off48,r4
-	VPERM(v19,v19,v19,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v12,v20,const1)
-	lvx	v20,off64,r4
-	VPERM(v20,v20,v20,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v13,v21,const1)
-	lvx	v21,off80,r4
-	VPERM(v21,v21,v21,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v14,v22,const1)
-	lvx	v22,off96,r4
-	VPERM(v22,v22,v22,byteswap)
-	ori	r2,r2,0
-
-	VPMSUMD(v15,v23,const1)
-	lvx	v23,off112,r4
-	VPERM(v23,v23,v23,byteswap)
-
-	addi	r4,r4,8*16
-
-	bdz	.Lfirst_cool_down
-
-	/*
-	 * main loop. We modulo schedule it such that it takes three iterations
-	 * to complete - first iteration load, second iteration vpmsum, third
-	 * iteration xor.
-	 */
-	.balign	16
-4:	lvx	const1,0,r3
-	addi	r3,r3,16
-	ori	r2,r2,0
-
-	vxor	v0,v0,v8
-	VPMSUMD(v8,v16,const2)
-	lvx	v16,0,r4
-	VPERM(v16,v16,v16,byteswap)
-	ori	r2,r2,0
-
-	vxor	v1,v1,v9
-	VPMSUMD(v9,v17,const2)
-	lvx	v17,off16,r4
-	VPERM(v17,v17,v17,byteswap)
-	ori	r2,r2,0
-
-	vxor	v2,v2,v10
-	VPMSUMD(v10,v18,const2)
-	lvx	v18,off32,r4
-	VPERM(v18,v18,v18,byteswap)
-	ori	r2,r2,0
-
-	vxor	v3,v3,v11
-	VPMSUMD(v11,v19,const2)
-	lvx	v19,off48,r4
-	VPERM(v19,v19,v19,byteswap)
-	lvx	const2,0,r3
-	ori	r2,r2,0
-
-	vxor	v4,v4,v12
-	VPMSUMD(v12,v20,const1)
-	lvx	v20,off64,r4
-	VPERM(v20,v20,v20,byteswap)
-	ori	r2,r2,0
-
-	vxor	v5,v5,v13
-	VPMSUMD(v13,v21,const1)
-	lvx	v21,off80,r4
-	VPERM(v21,v21,v21,byteswap)
-	ori	r2,r2,0
-
-	vxor	v6,v6,v14
-	VPMSUMD(v14,v22,const1)
-	lvx	v22,off96,r4
-	VPERM(v22,v22,v22,byteswap)
-	ori	r2,r2,0
-
-	vxor	v7,v7,v15
-	VPMSUMD(v15,v23,const1)
-	lvx	v23,off112,r4
-	VPERM(v23,v23,v23,byteswap)
-
-	addi	r4,r4,8*16
-
-	bdnz	4b
-
-.Lfirst_cool_down:
-	/* First cool down pass */
-	lvx	const1,0,r3
-	addi	r3,r3,16
-
-	vxor	v0,v0,v8
-	VPMSUMD(v8,v16,const1)
-	ori	r2,r2,0
-
-	vxor	v1,v1,v9
-	VPMSUMD(v9,v17,const1)
-	ori	r2,r2,0
-
-	vxor	v2,v2,v10
-	VPMSUMD(v10,v18,const1)
-	ori	r2,r2,0
-
-	vxor	v3,v3,v11
-	VPMSUMD(v11,v19,const1)
-	ori	r2,r2,0
-
-	vxor	v4,v4,v12
-	VPMSUMD(v12,v20,const1)
-	ori	r2,r2,0
-
-	vxor	v5,v5,v13
-	VPMSUMD(v13,v21,const1)
-	ori	r2,r2,0
-
-	vxor	v6,v6,v14
-	VPMSUMD(v14,v22,const1)
-	ori	r2,r2,0
-
-	vxor	v7,v7,v15
-	VPMSUMD(v15,v23,const1)
-	ori	r2,r2,0
-
-.Lsecond_cool_down:
-	/* Second cool down pass */
-	vxor	v0,v0,v8
-	vxor	v1,v1,v9
-	vxor	v2,v2,v10
-	vxor	v3,v3,v11
-	vxor	v4,v4,v12
-	vxor	v5,v5,v13
-	vxor	v6,v6,v14
-	vxor	v7,v7,v15
-
-	/*
-	 * vpmsumd produces a 96 bit result in the least significant bits
-	 * of the register. Since we are bit reflected we have to shift it
-	 * left 32 bits so it occupies the least significant bits in the
-	 * bit reflected domain.
-	 */
-	vsldoi	v0,v0,zeroes,4
-	vsldoi	v1,v1,zeroes,4
-	vsldoi	v2,v2,zeroes,4
-	vsldoi	v3,v3,zeroes,4
-	vsldoi	v4,v4,zeroes,4
-	vsldoi	v5,v5,zeroes,4
-	vsldoi	v6,v6,zeroes,4
-	vsldoi	v7,v7,zeroes,4
-
-	/* xor with last 1024 bits */
-	lvx	v8,0,r4
-	lvx	v9,off16,r4
-	VPERM(v8,v8,v8,byteswap)
-	VPERM(v9,v9,v9,byteswap)
-	lvx	v10,off32,r4
-	lvx	v11,off48,r4
-	VPERM(v10,v10,v10,byteswap)
-	VPERM(v11,v11,v11,byteswap)
-	lvx	v12,off64,r4
-	lvx	v13,off80,r4
-	VPERM(v12,v12,v12,byteswap)
-	VPERM(v13,v13,v13,byteswap)
-	lvx	v14,off96,r4
-	lvx	v15,off112,r4
-	VPERM(v14,v14,v14,byteswap)
-	VPERM(v15,v15,v15,byteswap)
-
-	addi	r4,r4,8*16
-
-	vxor	v16,v0,v8
-	vxor	v17,v1,v9
-	vxor	v18,v2,v10
-	vxor	v19,v3,v11
-	vxor	v20,v4,v12
-	vxor	v21,v5,v13
-	vxor	v22,v6,v14
-	vxor	v23,v7,v15
-
-	li	r0,1
-	cmpdi	r6,0
-	addi	r6,r6,128
-	bne	1b
-
-	/* Work out how many bytes we have left */
-	andi.	r5,r5,127
-
-	/* Calculate where in the constant table we need to start */
-	subfic	r6,r5,128
-	add	r3,r3,r6
-
-	/* How many 16 byte chunks are in the tail */
-	srdi	r7,r5,4
-	mtctr	r7
-
-	/*
-	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
-	 * 32 bits to include the trailing 32 bits of zeros
-	 */
-	lvx	v0,0,r3
-	lvx	v1,off16,r3
-	lvx	v2,off32,r3
-	lvx	v3,off48,r3
-	lvx	v4,off64,r3
-	lvx	v5,off80,r3
-	lvx	v6,off96,r3
-	lvx	v7,off112,r3
-	addi	r3,r3,8*16
-
-	VPMSUMW(v0,v16,v0)
-	VPMSUMW(v1,v17,v1)
-	VPMSUMW(v2,v18,v2)
-	VPMSUMW(v3,v19,v3)
-	VPMSUMW(v4,v20,v4)
-	VPMSUMW(v5,v21,v5)
-	VPMSUMW(v6,v22,v6)
-	VPMSUMW(v7,v23,v7)
-
-	/* Now reduce the tail (0 - 112 bytes) */
-	cmpdi	r7,0
-	beq	1f
-
-	lvx	v16,0,r4
-	lvx	v17,0,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off32,r4
-	lvx	v17,off32,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off64,r4
-	lvx	v17,off64,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-	bdz	1f
-
-	lvx	v16,off96,r4
-	lvx	v17,off96,r3
-	VPERM(v16,v16,v16,byteswap)
-	VPMSUMW(v16,v16,v17)
-	vxor	v0,v0,v16
-
-	/* Now xor all the parallel chunks together */
-1:	vxor	v0,v0,v1
-	vxor	v2,v2,v3
-	vxor	v4,v4,v5
-	vxor	v6,v6,v7
-
-	vxor	v0,v0,v2
-	vxor	v4,v4,v6
-
-	vxor	v0,v0,v4
-
-.Lbarrett_reduction:
-	/* Barrett constants */
-	addis	r3,r2,.barrett_constants@toc@ha
-	addi	r3,r3,.barrett_constants@toc@l
-
-	lvx	const1,0,r3
-	lvx	const2,off16,r3
-
-	vsldoi	v1,v0,v0,8
-	vxor	v0,v0,v1		/* xor two 64 bit results together */
-
-	/* shift left one bit */
-	vspltisb v1,1
-	vsl	v0,v0,v1
-
-	vand	v0,v0,mask_64bit
-
-	/*
-	 * The reflected version of Barrett reduction. Instead of bit
-	 * reflecting our data (which is expensive to do), we bit reflect our
-	 * constants and our algorithm, which means the intermediate data in
-	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
-	 * the algorithm because we don't carry in mod 2 arithmetic.
-	 */
-	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
-	VPMSUMD(v1,v1,const1)		/* ma */
-	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
-	VPMSUMD(v1,v1,const2)		/* qn */
-	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
-
-	/*
-	 * Since we are bit reflected, the result (ie the low 32 bits) is in
-	 * the high 32 bits. We just need to shift it left 4 bytes
-	 * V0 [ 0 1 X 3 ]
-	 * V0 [ 0 X 2 3 ]
-	 */
-	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of */
-
-	/* Get it into r3 */
-	MFVRD(R3, v0)
-
-.Lout:
-	subi	r6,r1,56+10*16
-	subi	r7,r1,56+2*16
-
-	lvx	v20,0,r6
-	lvx	v21,off16,r6
-	lvx	v22,off32,r6
-	lvx	v23,off48,r6
-	lvx	v24,off64,r6
-	lvx	v25,off80,r6
-	lvx	v26,off96,r6
-	lvx	v27,off112,r6
-	lvx	v28,0,r7
-	lvx	v29,off16,r7
-
-	ld	r31,-8(r1)
-	ld	r30,-16(r1)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
-
-	blr
-
-.Lfirst_warm_up_done:
-	lvx	const1,0,r3
-	addi	r3,r3,16
-
-	VPMSUMD(v8,v16,const1)
-	VPMSUMD(v9,v17,const1)
-	VPMSUMD(v10,v18,const1)
-	VPMSUMD(v11,v19,const1)
-	VPMSUMD(v12,v20,const1)
-	VPMSUMD(v13,v21,const1)
-	VPMSUMD(v14,v22,const1)
-	VPMSUMD(v15,v23,const1)
-
-	b	.Lsecond_cool_down
-
-.Lshort:
-	cmpdi	r5,0
-	beq	.Lzero
-
-	addis	r3,r2,.short_constants@toc@ha
-	addi	r3,r3,.short_constants@toc@l
-
-	/* Calculate where in the constant table we need to start */
-	subfic	r6,r5,256
-	add	r3,r3,r6
-
-	/* How many 16 byte chunks? */
-	srdi	r7,r5,4
-	mtctr	r7
-
-	vxor	v19,v19,v19
-	vxor	v20,v20,v20
-
-	lvx	v0,0,r4
-	lvx	v16,0,r3
-	VPERM(v0,v0,v16,byteswap)
-	vxor	v0,v0,v8	/* xor in initial value */
-	VPMSUMW(v0,v0,v16)
-	bdz	.Lv0
-
-	lvx	v1,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v1,v1,v17,byteswap)
-	VPMSUMW(v1,v1,v17)
-	bdz	.Lv1
-
-	lvx	v2,off32,r4
-	lvx	v16,off32,r3
-	VPERM(v2,v2,v16,byteswap)
-	VPMSUMW(v2,v2,v16)
-	bdz	.Lv2
-
-	lvx	v3,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v3,v3,v17,byteswap)
-	VPMSUMW(v3,v3,v17)
-	bdz	.Lv3
-
-	lvx	v4,off64,r4
-	lvx	v16,off64,r3
-	VPERM(v4,v4,v16,byteswap)
-	VPMSUMW(v4,v4,v16)
-	bdz	.Lv4
-
-	lvx	v5,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v5,v5,v17,byteswap)
-	VPMSUMW(v5,v5,v17)
-	bdz	.Lv5
-
-	lvx	v6,off96,r4
-	lvx	v16,off96,r3
-	VPERM(v6,v6,v16,byteswap)
-	VPMSUMW(v6,v6,v16)
-	bdz	.Lv6
-
-	lvx	v7,off112,r4
-	lvx	v17,off112,r3
-	VPERM(v7,v7,v17,byteswap)
-	VPMSUMW(v7,v7,v17)
-	bdz	.Lv7
-
-	addi	r3,r3,128
-	addi	r4,r4,128
-
-	lvx	v8,0,r4
-	lvx	v16,0,r3
-	VPERM(v8,v8,v16,byteswap)
-	VPMSUMW(v8,v8,v16)
-	bdz	.Lv8
-
-	lvx	v9,off16,r4
-	lvx	v17,off16,r3
-	VPERM(v9,v9,v17,byteswap)
-	VPMSUMW(v9,v9,v17)
-	bdz	.Lv9
-
-	lvx	v10,off32,r4
-	lvx	v16,off32,r3
-	VPERM(v10,v10,v16,byteswap)
-	VPMSUMW(v10,v10,v16)
-	bdz	.Lv10
-
-	lvx	v11,off48,r4
-	lvx	v17,off48,r3
-	VPERM(v11,v11,v17,byteswap)
-	VPMSUMW(v11,v11,v17)
-	bdz	.Lv11
-
-	lvx	v12,off64,r4
-	lvx	v16,off64,r3
-	VPERM(v12,v12,v16,byteswap)
-	VPMSUMW(v12,v12,v16)
-	bdz	.Lv12
-
-	lvx	v13,off80,r4
-	lvx	v17,off80,r3
-	VPERM(v13,v13,v17,byteswap)
-	VPMSUMW(v13,v13,v17)
-	bdz	.Lv13
-
-	lvx	v14,off96,r4
-	lvx	v16,off96,r3
-	VPERM(v14,v14,v16,byteswap)
-	VPMSUMW(v14,v14,v16)
-	bdz	.Lv14
-
-	lvx	v15,off112,r4
-	lvx	v17,off112,r3
-	VPERM(v15,v15,v17,byteswap)
-	VPMSUMW(v15,v15,v17)
-
-.Lv15:	vxor	v19,v19,v15
-.Lv14:	vxor	v20,v20,v14
-.Lv13:	vxor	v19,v19,v13
-.Lv12:	vxor	v20,v20,v12
-.Lv11:	vxor	v19,v19,v11
-.Lv10:	vxor	v20,v20,v10
-.Lv9:	vxor	v19,v19,v9
-.Lv8:	vxor	v20,v20,v8
-.Lv7:	vxor	v19,v19,v7
-.Lv6:	vxor	v20,v20,v6
-.Lv5:	vxor	v19,v19,v5
-.Lv4:	vxor	v20,v20,v4
-.Lv3:	vxor	v19,v19,v3
-.Lv2:	vxor	v20,v20,v2
-.Lv1:	vxor	v19,v19,v1
-.Lv0:	vxor	v20,v20,v0
-
-	vxor	v0,v19,v20
-
-	b	.Lbarrett_reduction
-
-.Lzero:
-	mr	r3,r10
-	b	.Lout
-
-FUNC_END(__crc32_vpmsum)
+#define CRC_FUNCTION_NAME __crc32c_vpmsum
+#define REFLECT
+#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
new file mode 100644
index 000000000000..5e3d81a0af1b
--- /dev/null
+++ b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
@@ -0,0 +1,850 @@
+/*
+ * Calculate a CRC T10DIF  with vpmsum acceleration
+ *
+ * Constants generated by crc32-vpmsum, available at
+ * https://github.com/antonblanchard/crc32-vpmsum
+ *
+ * crc32-vpmsum is
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ * and is available under the GPL v2 or later.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+	.section	.rodata
+.balign 16
+
+.byteswap_constant:
+	/* byte reverse permute constant */
+	.octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+	/* Reduce 262144 kbits to 1024 bits */
+	/* x^261184 mod p(x), x^261120 mod p(x) */
+	.octa 0x0000000056d300000000000052550000
+
+	/* x^260160 mod p(x), x^260096 mod p(x) */
+	.octa 0x00000000ee67000000000000a1e40000
+
+	/* x^259136 mod p(x), x^259072 mod p(x) */
+	.octa 0x0000000060830000000000004ad10000
+
+	/* x^258112 mod p(x), x^258048 mod p(x) */
+	.octa 0x000000008cfe0000000000009ab40000
+
+	/* x^257088 mod p(x), x^257024 mod p(x) */
+	.octa 0x000000003e93000000000000fdb50000
+
+	/* x^256064 mod p(x), x^256000 mod p(x) */
+	.octa 0x000000003c2000000000000045480000
+
+	/* x^255040 mod p(x), x^254976 mod p(x) */
+	.octa 0x00000000b1fc0000000000008d690000
+
+	/* x^254016 mod p(x), x^253952 mod p(x) */
+	.octa 0x00000000f82b00000000000024ad0000
+
+	/* x^252992 mod p(x), x^252928 mod p(x) */
+	.octa 0x0000000044420000000000009f1a0000
+
+	/* x^251968 mod p(x), x^251904 mod p(x) */
+	.octa 0x00000000e88c00000000000066ec0000
+
+	/* x^250944 mod p(x), x^250880 mod p(x) */
+	.octa 0x00000000385c000000000000c87d0000
+
+	/* x^249920 mod p(x), x^249856 mod p(x) */
+	.octa 0x000000003227000000000000c8ff0000
+
+	/* x^248896 mod p(x), x^248832 mod p(x) */
+	.octa 0x00000000a9a900000000000033440000
+
+	/* x^247872 mod p(x), x^247808 mod p(x) */
+	.octa 0x00000000abaa00000000000066eb0000
+
+	/* x^246848 mod p(x), x^246784 mod p(x) */
+	.octa 0x000000001ac3000000000000c4ef0000
+
+	/* x^245824 mod p(x), x^245760 mod p(x) */
+	.octa 0x0000000063f000000000000056f30000
+
+	/* x^244800 mod p(x), x^244736 mod p(x) */
+	.octa 0x0000000032cc00000000000002050000
+
+	/* x^243776 mod p(x), x^243712 mod p(x) */
+	.octa 0x00000000f8b5000000000000568e0000
+
+	/* x^242752 mod p(x), x^242688 mod p(x) */
+	.octa 0x000000008db100000000000064290000
+
+	/* x^241728 mod p(x), x^241664 mod p(x) */
+	.octa 0x0000000059ca0000000000006b660000
+
+	/* x^240704 mod p(x), x^240640 mod p(x) */
+	.octa 0x000000005f5c00000000000018f80000
+
+	/* x^239680 mod p(x), x^239616 mod p(x) */
+	.octa 0x0000000061af000000000000b6090000
+
+	/* x^238656 mod p(x), x^238592 mod p(x) */
+	.octa 0x00000000e29e000000000000099a0000
+
+	/* x^237632 mod p(x), x^237568 mod p(x) */
+	.octa 0x000000000975000000000000a8360000
+
+	/* x^236608 mod p(x), x^236544 mod p(x) */
+	.octa 0x0000000043900000000000004f570000
+
+	/* x^235584 mod p(x), x^235520 mod p(x) */
+	.octa 0x00000000f9cd000000000000134c0000
+
+	/* x^234560 mod p(x), x^234496 mod p(x) */
+	.octa 0x000000007c29000000000000ec380000
+
+	/* x^233536 mod p(x), x^233472 mod p(x) */
+	.octa 0x000000004c6a000000000000b0d10000
+
+	/* x^232512 mod p(x), x^232448 mod p(x) */
+	.octa 0x00000000e7290000000000007d3e0000
+
+	/* x^231488 mod p(x), x^231424 mod p(x) */
+	.octa 0x00000000f1ab000000000000f0b20000
+
+	/* x^230464 mod p(x), x^230400 mod p(x) */
+	.octa 0x0000000039db0000000000009c270000
+
+	/* x^229440 mod p(x), x^229376 mod p(x) */
+	.octa 0x000000005e2800000000000092890000
+
+	/* x^228416 mod p(x), x^228352 mod p(x) */
+	.octa 0x00000000d44e000000000000d5ee0000
+
+	/* x^227392 mod p(x), x^227328 mod p(x) */
+	.octa 0x00000000cd0a00000000000041f50000
+
+	/* x^226368 mod p(x), x^226304 mod p(x) */
+	.octa 0x00000000c5b400000000000010520000
+
+	/* x^225344 mod p(x), x^225280 mod p(x) */
+	.octa 0x00000000fd2100000000000042170000
+
+	/* x^224320 mod p(x), x^224256 mod p(x) */
+	.octa 0x000000002f2500000000000095c20000
+
+	/* x^223296 mod p(x), x^223232 mod p(x) */
+	.octa 0x000000001b0100000000000001ce0000
+
+	/* x^222272 mod p(x), x^222208 mod p(x) */
+	.octa 0x000000000d430000000000002aca0000
+
+	/* x^221248 mod p(x), x^221184 mod p(x) */
+	.octa 0x0000000030a6000000000000385e0000
+
+	/* x^220224 mod p(x), x^220160 mod p(x) */
+	.octa 0x00000000e37b0000000000006f7a0000
+
+	/* x^219200 mod p(x), x^219136 mod p(x) */
+	.octa 0x00000000873600000000000024320000
+
+	/* x^218176 mod p(x), x^218112 mod p(x) */
+	.octa 0x00000000e9fb000000000000bd9c0000
+
+	/* x^217152 mod p(x), x^217088 mod p(x) */
+	.octa 0x000000003b9500000000000054bc0000
+
+	/* x^216128 mod p(x), x^216064 mod p(x) */
+	.octa 0x00000000133e000000000000a4660000
+
+	/* x^215104 mod p(x), x^215040 mod p(x) */
+	.octa 0x00000000784500000000000079930000
+
+	/* x^214080 mod p(x), x^214016 mod p(x) */
+	.octa 0x00000000b9800000000000001bb80000
+
+	/* x^213056 mod p(x), x^212992 mod p(x) */
+	.octa 0x00000000687600000000000024400000
+
+	/* x^212032 mod p(x), x^211968 mod p(x) */
+	.octa 0x00000000aff300000000000029e10000
+
+	/* x^211008 mod p(x), x^210944 mod p(x) */
+	.octa 0x0000000024b50000000000005ded0000
+
+	/* x^209984 mod p(x), x^209920 mod p(x) */
+	.octa 0x0000000017e8000000000000b12e0000
+
+	/* x^208960 mod p(x), x^208896 mod p(x) */
+	.octa 0x00000000128400000000000026d20000
+
+	/* x^207936 mod p(x), x^207872 mod p(x) */
+	.octa 0x000000002115000000000000a32a0000
+
+	/* x^206912 mod p(x), x^206848 mod p(x) */
+	.octa 0x000000009595000000000000a1210000
+
+	/* x^205888 mod p(x), x^205824 mod p(x) */
+	.octa 0x00000000281e000000000000ee8b0000
+
+	/* x^204864 mod p(x), x^204800 mod p(x) */
+	.octa 0x0000000006010000000000003d0d0000
+
+	/* x^203840 mod p(x), x^203776 mod p(x) */
+	.octa 0x00000000e2b600000000000034e90000
+
+	/* x^202816 mod p(x), x^202752 mod p(x) */
+	.octa 0x000000001bd40000000000004cdb0000
+
+	/* x^201792 mod p(x), x^201728 mod p(x) */
+	.octa 0x00000000df2800000000000030e90000
+
+	/* x^200768 mod p(x), x^200704 mod p(x) */
+	.octa 0x0000000049c200000000000042590000
+
+	/* x^199744 mod p(x), x^199680 mod p(x) */
+	.octa 0x000000009b97000000000000df950000
+
+	/* x^198720 mod p(x), x^198656 mod p(x) */
+	.octa 0x000000006184000000000000da7b0000
+
+	/* x^197696 mod p(x), x^197632 mod p(x) */
+	.octa 0x00000000461700000000000012510000
+
+	/* x^196672 mod p(x), x^196608 mod p(x) */
+	.octa 0x000000009b40000000000000f37e0000
+
+	/* x^195648 mod p(x), x^195584 mod p(x) */
+	.octa 0x00000000eeb2000000000000ecf10000
+
+	/* x^194624 mod p(x), x^194560 mod p(x) */
+	.octa 0x00000000b2e800000000000050f20000
+
+	/* x^193600 mod p(x), x^193536 mod p(x) */
+	.octa 0x00000000f59a000000000000e0b30000
+
+	/* x^192576 mod p(x), x^192512 mod p(x) */
+	.octa 0x00000000467f0000000000004d5a0000
+
+	/* x^191552 mod p(x), x^191488 mod p(x) */
+	.octa 0x00000000da92000000000000bb010000
+
+	/* x^190528 mod p(x), x^190464 mod p(x) */
+	.octa 0x000000001e1000000000000022a40000
+
+	/* x^189504 mod p(x), x^189440 mod p(x) */
+	.octa 0x0000000058fe000000000000836f0000
+
+	/* x^188480 mod p(x), x^188416 mod p(x) */
+	.octa 0x00000000b9ce000000000000d78d0000
+
+	/* x^187456 mod p(x), x^187392 mod p(x) */
+	.octa 0x0000000022210000000000004f8d0000
+
+	/* x^186432 mod p(x), x^186368 mod p(x) */
+	.octa 0x00000000744600000000000033760000
+
+	/* x^185408 mod p(x), x^185344 mod p(x) */
+	.octa 0x000000001c2e000000000000a1e50000
+
+	/* x^184384 mod p(x), x^184320 mod p(x) */
+	.octa 0x00000000dcc8000000000000a1a40000
+
+	/* x^183360 mod p(x), x^183296 mod p(x) */
+	.octa 0x00000000910f00000000000019a20000
+
+	/* x^182336 mod p(x), x^182272 mod p(x) */
+	.octa 0x0000000055d5000000000000f6ae0000
+
+	/* x^181312 mod p(x), x^181248 mod p(x) */
+	.octa 0x00000000c8ba000000000000a7ac0000
+
+	/* x^180288 mod p(x), x^180224 mod p(x) */
+	.octa 0x0000000031f8000000000000eea20000
+
+	/* x^179264 mod p(x), x^179200 mod p(x) */
+	.octa 0x000000001966000000000000c4d90000
+
+	/* x^178240 mod p(x), x^178176 mod p(x) */
+	.octa 0x00000000b9810000000000002b470000
+
+	/* x^177216 mod p(x), x^177152 mod p(x) */
+	.octa 0x000000008303000000000000f7cf0000
+
+	/* x^176192 mod p(x), x^176128 mod p(x) */
+	.octa 0x000000002ce500000000000035b30000
+
+	/* x^175168 mod p(x), x^175104 mod p(x) */
+	.octa 0x000000002fae0000000000000c7c0000
+
+	/* x^174144 mod p(x), x^174080 mod p(x) */
+	.octa 0x00000000f50c0000000000009edf0000
+
+	/* x^173120 mod p(x), x^173056 mod p(x) */
+	.octa 0x00000000714f00000000000004cd0000
+
+	/* x^172096 mod p(x), x^172032 mod p(x) */
+	.octa 0x00000000c161000000000000541b0000
+
+	/* x^171072 mod p(x), x^171008 mod p(x) */
+	.octa 0x0000000021c8000000000000e2700000
+
+	/* x^170048 mod p(x), x^169984 mod p(x) */
+	.octa 0x00000000b93d00000000000009a60000
+
+	/* x^169024 mod p(x), x^168960 mod p(x) */
+	.octa 0x00000000fbcf000000000000761c0000
+
+	/* x^168000 mod p(x), x^167936 mod p(x) */
+	.octa 0x0000000026350000000000009db30000
+
+	/* x^166976 mod p(x), x^166912 mod p(x) */
+	.octa 0x00000000b64f0000000000003e9f0000
+
+	/* x^165952 mod p(x), x^165888 mod p(x) */
+	.octa 0x00000000bd0e00000000000078590000
+
+	/* x^164928 mod p(x), x^164864 mod p(x) */
+	.octa 0x00000000d9360000000000008bc80000
+
+	/* x^163904 mod p(x), x^163840 mod p(x) */
+	.octa 0x000000002f140000000000008c9f0000
+
+	/* x^162880 mod p(x), x^162816 mod p(x) */
+	.octa 0x000000006a270000000000006af70000
+
+	/* x^161856 mod p(x), x^161792 mod p(x) */
+	.octa 0x000000006685000000000000e5210000
+
+	/* x^160832 mod p(x), x^160768 mod p(x) */
+	.octa 0x0000000062da00000000000008290000
+
+	/* x^159808 mod p(x), x^159744 mod p(x) */
+	.octa 0x00000000bb4b000000000000e4d00000
+
+	/* x^158784 mod p(x), x^158720 mod p(x) */
+	.octa 0x00000000d2490000000000004ae10000
+
+	/* x^157760 mod p(x), x^157696 mod p(x) */
+	.octa 0x00000000c85b00000000000000e70000
+
+	/* x^156736 mod p(x), x^156672 mod p(x) */
+	.octa 0x00000000c37a00000000000015650000
+
+	/* x^155712 mod p(x), x^155648 mod p(x) */
+	.octa 0x0000000018530000000000001c2f0000
+
+	/* x^154688 mod p(x), x^154624 mod p(x) */
+	.octa 0x00000000b46600000000000037bd0000
+
+	/* x^153664 mod p(x), x^153600 mod p(x) */
+	.octa 0x00000000439b00000000000012190000
+
+	/* x^152640 mod p(x), x^152576 mod p(x) */
+	.octa 0x00000000b1260000000000005ece0000
+
+	/* x^151616 mod p(x), x^151552 mod p(x) */
+	.octa 0x00000000d8110000000000002a5e0000
+
+	/* x^150592 mod p(x), x^150528 mod p(x) */
+	.octa 0x00000000099f00000000000052330000
+
+	/* x^149568 mod p(x), x^149504 mod p(x) */
+	.octa 0x00000000f9f9000000000000f9120000
+
+	/* x^148544 mod p(x), x^148480 mod p(x) */
+	.octa 0x000000005cc00000000000000ddc0000
+
+	/* x^147520 mod p(x), x^147456 mod p(x) */
+	.octa 0x00000000343b00000000000012200000
+
+	/* x^146496 mod p(x), x^146432 mod p(x) */
+	.octa 0x000000009222000000000000d12b0000
+
+	/* x^145472 mod p(x), x^145408 mod p(x) */
+	.octa 0x00000000d781000000000000eb2d0000
+
+	/* x^144448 mod p(x), x^144384 mod p(x) */
+	.octa 0x000000000bf400000000000058970000
+
+	/* x^143424 mod p(x), x^143360 mod p(x) */
+	.octa 0x00000000094200000000000013690000
+
+	/* x^142400 mod p(x), x^142336 mod p(x) */
+	.octa 0x00000000d55100000000000051950000
+
+	/* x^141376 mod p(x), x^141312 mod p(x) */
+	.octa 0x000000008f11000000000000954b0000
+
+	/* x^140352 mod p(x), x^140288 mod p(x) */
+	.octa 0x00000000140f000000000000b29e0000
+
+	/* x^139328 mod p(x), x^139264 mod p(x) */
+	.octa 0x00000000c6db000000000000db5d0000
+
+	/* x^138304 mod p(x), x^138240 mod p(x) */
+	.octa 0x00000000715b000000000000dfaf0000
+
+	/* x^137280 mod p(x), x^137216 mod p(x) */
+	.octa 0x000000000dea000000000000e3b60000
+
+	/* x^136256 mod p(x), x^136192 mod p(x) */
+	.octa 0x000000006f94000000000000ddaf0000
+
+	/* x^135232 mod p(x), x^135168 mod p(x) */
+	.octa 0x0000000024e1000000000000e4f70000
+
+	/* x^134208 mod p(x), x^134144 mod p(x) */
+	.octa 0x000000008810000000000000aa110000
+
+	/* x^133184 mod p(x), x^133120 mod p(x) */
+	.octa 0x0000000030c2000000000000a8e60000
+
+	/* x^132160 mod p(x), x^132096 mod p(x) */
+	.octa 0x00000000e6d0000000000000ccf30000
+
+	/* x^131136 mod p(x), x^131072 mod p(x) */
+	.octa 0x000000004da000000000000079bf0000
+
+	/* x^130112 mod p(x), x^130048 mod p(x) */
+	.octa 0x000000007759000000000000b3a30000
+
+	/* x^129088 mod p(x), x^129024 mod p(x) */
+	.octa 0x00000000597400000000000028790000
+
+	/* x^128064 mod p(x), x^128000 mod p(x) */
+	.octa 0x000000007acd000000000000b5820000
+
+	/* x^127040 mod p(x), x^126976 mod p(x) */
+	.octa 0x00000000e6e400000000000026ad0000
+
+	/* x^126016 mod p(x), x^125952 mod p(x) */
+	.octa 0x000000006d49000000000000985b0000
+
+	/* x^124992 mod p(x), x^124928 mod p(x) */
+	.octa 0x000000000f0800000000000011520000
+
+	/* x^123968 mod p(x), x^123904 mod p(x) */
+	.octa 0x000000002c7f000000000000846c0000
+
+	/* x^122944 mod p(x), x^122880 mod p(x) */
+	.octa 0x000000005ce7000000000000ae1d0000
+
+	/* x^121920 mod p(x), x^121856 mod p(x) */
+	.octa 0x00000000d4cb000000000000e21d0000
+
+	/* x^120896 mod p(x), x^120832 mod p(x) */
+	.octa 0x000000003a2300000000000019bb0000
+
+	/* x^119872 mod p(x), x^119808 mod p(x) */
+	.octa 0x000000000e1700000000000095290000
+
+	/* x^118848 mod p(x), x^118784 mod p(x) */
+	.octa 0x000000006e6400000000000050d20000
+
+	/* x^117824 mod p(x), x^117760 mod p(x) */
+	.octa 0x000000008d5c0000000000000cd10000
+
+	/* x^116800 mod p(x), x^116736 mod p(x) */
+	.octa 0x00000000ef310000000000007b570000
+
+	/* x^115776 mod p(x), x^115712 mod p(x) */
+	.octa 0x00000000645d00000000000053d60000
+
+	/* x^114752 mod p(x), x^114688 mod p(x) */
+	.octa 0x0000000018fc00000000000077510000
+
+	/* x^113728 mod p(x), x^113664 mod p(x) */
+	.octa 0x000000000cb3000000000000a7b70000
+
+	/* x^112704 mod p(x), x^112640 mod p(x) */
+	.octa 0x00000000991b000000000000d0780000
+
+	/* x^111680 mod p(x), x^111616 mod p(x) */
+	.octa 0x00000000845a000000000000be3c0000
+
+	/* x^110656 mod p(x), x^110592 mod p(x) */
+	.octa 0x00000000d3a9000000000000df020000
+
+	/* x^109632 mod p(x), x^109568 mod p(x) */
+	.octa 0x0000000017d7000000000000063e0000
+
+	/* x^108608 mod p(x), x^108544 mod p(x) */
+	.octa 0x000000007a860000000000008ab40000
+
+	/* x^107584 mod p(x), x^107520 mod p(x) */
+	.octa 0x00000000fd7c000000000000c7bd0000
+
+	/* x^106560 mod p(x), x^106496 mod p(x) */
+	.octa 0x00000000a56b000000000000efd60000
+
+	/* x^105536 mod p(x), x^105472 mod p(x) */
+	.octa 0x0000000010e400000000000071380000
+
+	/* x^104512 mod p(x), x^104448 mod p(x) */
+	.octa 0x00000000994500000000000004d30000
+
+	/* x^103488 mod p(x), x^103424 mod p(x) */
+	.octa 0x00000000b83c0000000000003b0e0000
+
+	/* x^102464 mod p(x), x^102400 mod p(x) */
+	.octa 0x00000000d6c10000000000008b020000
+
+	/* x^101440 mod p(x), x^101376 mod p(x) */
+	.octa 0x000000009efc000000000000da940000
+
+	/* x^100416 mod p(x), x^100352 mod p(x) */
+	.octa 0x000000005e87000000000000f9f70000
+
+	/* x^99392 mod p(x), x^99328 mod p(x) */
+	.octa 0x000000006c9b00000000000045e40000
+
+	/* x^98368 mod p(x), x^98304 mod p(x) */
+	.octa 0x00000000178a00000000000083940000
+
+	/* x^97344 mod p(x), x^97280 mod p(x) */
+	.octa 0x00000000f0c8000000000000f0a00000
+
+	/* x^96320 mod p(x), x^96256 mod p(x) */
+	.octa 0x00000000f699000000000000b74b0000
+
+	/* x^95296 mod p(x), x^95232 mod p(x) */
+	.octa 0x00000000316d000000000000c1cf0000
+
+	/* x^94272 mod p(x), x^94208 mod p(x) */
+	.octa 0x00000000987e00000000000072680000
+
+	/* x^93248 mod p(x), x^93184 mod p(x) */
+	.octa 0x00000000acff000000000000e0ab0000
+
+	/* x^92224 mod p(x), x^92160 mod p(x) */
+	.octa 0x00000000a1f6000000000000c5a80000
+
+	/* x^91200 mod p(x), x^91136 mod p(x) */
+	.octa 0x0000000061bd000000000000cf690000
+
+	/* x^90176 mod p(x), x^90112 mod p(x) */
+	.octa 0x00000000c9f2000000000000cbcc0000
+
+	/* x^89152 mod p(x), x^89088 mod p(x) */
+	.octa 0x000000005a33000000000000de050000
+
+	/* x^88128 mod p(x), x^88064 mod p(x) */
+	.octa 0x00000000e416000000000000ccd70000
+
+	/* x^87104 mod p(x), x^87040 mod p(x) */
+	.octa 0x0000000058930000000000002f670000
+
+	/* x^86080 mod p(x), x^86016 mod p(x) */
+	.octa 0x00000000a9d3000000000000152f0000
+
+	/* x^85056 mod p(x), x^84992 mod p(x) */
+	.octa 0x00000000c114000000000000ecc20000
+
+	/* x^84032 mod p(x), x^83968 mod p(x) */
+	.octa 0x00000000b9270000000000007c890000
+
+	/* x^83008 mod p(x), x^82944 mod p(x) */
+	.octa 0x000000002e6000000000000006ee0000
+
+	/* x^81984 mod p(x), x^81920 mod p(x) */
+	.octa 0x00000000dfc600000000000009100000
+
+	/* x^80960 mod p(x), x^80896 mod p(x) */
+	.octa 0x000000004911000000000000ad4e0000
+
+	/* x^79936 mod p(x), x^79872 mod p(x) */
+	.octa 0x00000000ae1b000000000000b04d0000
+
+	/* x^78912 mod p(x), x^78848 mod p(x) */
+	.octa 0x0000000005fa000000000000e9900000
+
+	/* x^77888 mod p(x), x^77824 mod p(x) */
+	.octa 0x0000000004a1000000000000cc6f0000
+
+	/* x^76864 mod p(x), x^76800 mod p(x) */
+	.octa 0x00000000af73000000000000ed110000
+
+	/* x^75840 mod p(x), x^75776 mod p(x) */
+	.octa 0x0000000082530000000000008f7e0000
+
+	/* x^74816 mod p(x), x^74752 mod p(x) */
+	.octa 0x00000000cfdc000000000000594f0000
+
+	/* x^73792 mod p(x), x^73728 mod p(x) */
+	.octa 0x00000000a6b6000000000000a8750000
+
+	/* x^72768 mod p(x), x^72704 mod p(x) */
+	.octa 0x00000000fd76000000000000aa0c0000
+
+	/* x^71744 mod p(x), x^71680 mod p(x) */
+	.octa 0x0000000006f500000000000071db0000
+
+	/* x^70720 mod p(x), x^70656 mod p(x) */
+	.octa 0x0000000037ca000000000000ab0c0000
+
+	/* x^69696 mod p(x), x^69632 mod p(x) */
+	.octa 0x00000000d7ab000000000000b7a00000
+
+	/* x^68672 mod p(x), x^68608 mod p(x) */
+	.octa 0x00000000440800000000000090d30000
+
+	/* x^67648 mod p(x), x^67584 mod p(x) */
+	.octa 0x00000000186100000000000054730000
+
+	/* x^66624 mod p(x), x^66560 mod p(x) */
+	.octa 0x000000007368000000000000a3a20000
+
+	/* x^65600 mod p(x), x^65536 mod p(x) */
+	.octa 0x0000000026d0000000000000f9040000
+
+	/* x^64576 mod p(x), x^64512 mod p(x) */
+	.octa 0x00000000fe770000000000009c0a0000
+
+	/* x^63552 mod p(x), x^63488 mod p(x) */
+	.octa 0x000000002cba000000000000d1e70000
+
+	/* x^62528 mod p(x), x^62464 mod p(x) */
+	.octa 0x00000000f8bd0000000000005ac10000
+
+	/* x^61504 mod p(x), x^61440 mod p(x) */
+	.octa 0x000000007372000000000000d68d0000
+
+	/* x^60480 mod p(x), x^60416 mod p(x) */
+	.octa 0x00000000f37f00000000000089f60000
+
+	/* x^59456 mod p(x), x^59392 mod p(x) */
+	.octa 0x00000000078400000000000008a90000
+
+	/* x^58432 mod p(x), x^58368 mod p(x) */
+	.octa 0x00000000d3e400000000000042360000
+
+	/* x^57408 mod p(x), x^57344 mod p(x) */
+	.octa 0x00000000eba800000000000092d50000
+
+	/* x^56384 mod p(x), x^56320 mod p(x) */
+	.octa 0x00000000afbe000000000000b4d50000
+
+	/* x^55360 mod p(x), x^55296 mod p(x) */
+	.octa 0x00000000d8ca000000000000c9060000
+
+	/* x^54336 mod p(x), x^54272 mod p(x) */
+	.octa 0x00000000c2d00000000000008f4f0000
+
+	/* x^53312 mod p(x), x^53248 mod p(x) */
+	.octa 0x00000000373200000000000028690000
+
+	/* x^52288 mod p(x), x^52224 mod p(x) */
+	.octa 0x0000000046ae000000000000c3b30000
+
+	/* x^51264 mod p(x), x^51200 mod p(x) */
+	.octa 0x00000000b243000000000000f8700000
+
+	/* x^50240 mod p(x), x^50176 mod p(x) */
+	.octa 0x00000000f7f500000000000029eb0000
+
+	/* x^49216 mod p(x), x^49152 mod p(x) */
+	.octa 0x000000000c7e000000000000fe730000
+
+	/* x^48192 mod p(x), x^48128 mod p(x) */
+	.octa 0x00000000c38200000000000096000000
+
+	/* x^47168 mod p(x), x^47104 mod p(x) */
+	.octa 0x000000008956000000000000683c0000
+
+	/* x^46144 mod p(x), x^46080 mod p(x) */
+	.octa 0x00000000422d0000000000005f1e0000
+
+	/* x^45120 mod p(x), x^45056 mod p(x) */
+	.octa 0x00000000ac0f0000000000006f810000
+
+	/* x^44096 mod p(x), x^44032 mod p(x) */
+	.octa 0x00000000ce30000000000000031f0000
+
+	/* x^43072 mod p(x), x^43008 mod p(x) */
+	.octa 0x000000003d43000000000000455a0000
+
+	/* x^42048 mod p(x), x^41984 mod p(x) */
+	.octa 0x000000007ebe000000000000a6050000
+
+	/* x^41024 mod p(x), x^40960 mod p(x) */
+	.octa 0x00000000976e00000000000077eb0000
+
+	/* x^40000 mod p(x), x^39936 mod p(x) */
+	.octa 0x000000000872000000000000389c0000
+
+	/* x^38976 mod p(x), x^38912 mod p(x) */
+	.octa 0x000000008979000000000000c7b20000
+
+	/* x^37952 mod p(x), x^37888 mod p(x) */
+	.octa 0x000000005c1e0000000000001d870000
+
+	/* x^36928 mod p(x), x^36864 mod p(x) */
+	.octa 0x00000000aebb00000000000045810000
+
+	/* x^35904 mod p(x), x^35840 mod p(x) */
+	.octa 0x000000004f7e0000000000006d4a0000
+
+	/* x^34880 mod p(x), x^34816 mod p(x) */
+	.octa 0x00000000ea98000000000000b9200000
+
+	/* x^33856 mod p(x), x^33792 mod p(x) */
+	.octa 0x00000000f39600000000000022f20000
+
+	/* x^32832 mod p(x), x^32768 mod p(x) */
+	.octa 0x000000000bc500000000000041ca0000
+
+	/* x^31808 mod p(x), x^31744 mod p(x) */
+	.octa 0x00000000786400000000000078500000
+
+	/* x^30784 mod p(x), x^30720 mod p(x) */
+	.octa 0x00000000be970000000000009e7e0000
+
+	/* x^29760 mod p(x), x^29696 mod p(x) */
+	.octa 0x00000000dd6d000000000000a53c0000
+
+	/* x^28736 mod p(x), x^28672 mod p(x) */
+	.octa 0x000000004c3f00000000000039340000
+
+	/* x^27712 mod p(x), x^27648 mod p(x) */
+	.octa 0x0000000093a4000000000000b58e0000
+
+	/* x^26688 mod p(x), x^26624 mod p(x) */
+	.octa 0x0000000050fb00000000000062d40000
+
+	/* x^25664 mod p(x), x^25600 mod p(x) */
+	.octa 0x00000000f505000000000000a26f0000
+
+	/* x^24640 mod p(x), x^24576 mod p(x) */
+	.octa 0x0000000064f900000000000065e60000
+
+	/* x^23616 mod p(x), x^23552 mod p(x) */
+	.octa 0x00000000e8c2000000000000aad90000
+
+	/* x^22592 mod p(x), x^22528 mod p(x) */
+	.octa 0x00000000720b000000000000a3b00000
+
+	/* x^21568 mod p(x), x^21504 mod p(x) */
+	.octa 0x00000000e992000000000000d2680000
+
+	/* x^20544 mod p(x), x^20480 mod p(x) */
+	.octa 0x000000009132000000000000cf4c0000
+
+	/* x^19520 mod p(x), x^19456 mod p(x) */
+	.octa 0x00000000608a00000000000076610000
+
+	/* x^18496 mod p(x), x^18432 mod p(x) */
+	.octa 0x000000009948000000000000fb9f0000
+
+	/* x^17472 mod p(x), x^17408 mod p(x) */
+	.octa 0x00000000173000000000000003770000
+
+	/* x^16448 mod p(x), x^16384 mod p(x) */
+	.octa 0x000000006fe300000000000004880000
+
+	/* x^15424 mod p(x), x^15360 mod p(x) */
+	.octa 0x00000000e15300000000000056a70000
+
+	/* x^14400 mod p(x), x^14336 mod p(x) */
+	.octa 0x0000000092d60000000000009dfd0000
+
+	/* x^13376 mod p(x), x^13312 mod p(x) */
+	.octa 0x0000000002fd00000000000074c80000
+
+	/* x^12352 mod p(x), x^12288 mod p(x) */
+	.octa 0x00000000c78b000000000000a3ec0000
+
+	/* x^11328 mod p(x), x^11264 mod p(x) */
+	.octa 0x000000009262000000000000b3530000
+
+	/* x^10304 mod p(x), x^10240 mod p(x) */
+	.octa 0x0000000084f200000000000047bf0000
+
+	/* x^9280 mod p(x), x^9216 mod p(x) */
+	.octa 0x0000000067ee000000000000e97c0000
+
+	/* x^8256 mod p(x), x^8192 mod p(x) */
+	.octa 0x00000000535b00000000000091e10000
+
+	/* x^7232 mod p(x), x^7168 mod p(x) */
+	.octa 0x000000007ebb00000000000055060000
+
+	/* x^6208 mod p(x), x^6144 mod p(x) */
+	.octa 0x00000000c6a1000000000000fd360000
+
+	/* x^5184 mod p(x), x^5120 mod p(x) */
+	.octa 0x000000001be500000000000055860000
+
+	/* x^4160 mod p(x), x^4096 mod p(x) */
+	.octa 0x00000000ae0e0000000000005bd00000
+
+	/* x^3136 mod p(x), x^3072 mod p(x) */
+	.octa 0x0000000022040000000000008db20000
+
+	/* x^2112 mod p(x), x^2048 mod p(x) */
+	.octa 0x00000000c9eb000000000000efe20000
+
+	/* x^1088 mod p(x), x^1024 mod p(x) */
+	.octa 0x0000000039b400000000000051d10000
+
+.short_constants:
+
+	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+	/* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
+	.octa 0xefe20000dccf00009440000033590000
+
+	/* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
+	.octa 0xee6300002f3f000062180000e0ed0000
+
+	/* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
+	.octa 0xcf5f000017ef0000ccbe000023d30000
+
+	/* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
+	.octa 0x6d0c0000a30e00000920000042630000
+
+	/* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
+	.octa 0x21d30000932b0000a7a00000efcc0000
+
+	/* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
+	.octa 0x10be00000b310000666f00000d1c0000
+
+	/* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
+	.octa 0x1f240000ce9e0000caad0000589e0000
+
+	/* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
+	.octa 0x29610000d02b000039b400007cf50000
+
+	/* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
+	.octa 0x51d100009d9d00003c0e0000bfd60000
+
+	/* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
+	.octa 0xda390000ceae000013830000713c0000
+
+	/* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
+	.octa 0xb67800001e16000085c0000080a60000
+
+	/* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
+	.octa 0x0db40000f7f90000371d0000e6580000
+
+	/* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
+	.octa 0x87e70000044c0000aadb0000a4970000
+
+	/* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
+	.octa 0x1f990000ad180000d8b30000e7b50000
+
+	/* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
+	.octa 0xbe6c00006ee300004c1a000006df0000
+
+	/* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
+	.octa 0xfb0b00002d560000136800008bb70000
+
+
+.barrett_constants:
+	/* Barrett constant m - (4^32)/n */
+	.octa 0x000000000000000000000001f65a57f8	/* x^64 div p(x) */
+	/* Barrett constant n */
+	.octa 0x0000000000000000000000018bb70000
+
+#define CRC_FUNCTION_NAME __crct10dif_vpmsum
+#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
new file mode 100644
index 000000000000..02ea277863d1
--- /dev/null
+++ b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
@@ -0,0 +1,128 @@
+/*
+ * Calculate a CRC T10-DIF with vpmsum acceleration
+ *
+ * Copyright 2017, Daniel Axtens, IBM Corporation.
+ * [based on crc32c-vpmsum_glue.c]
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+#define VMX_ALIGN		16
+#define VMX_ALIGN_MASK		(VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT	64
+
+u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
+{
+	unsigned int prealign;
+	unsigned int tail;
+	u32 crc = crci;
+
+	if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt())
+		return crc_t10dif_generic(crc, p, len);
+
+	if ((unsigned long)p & VMX_ALIGN_MASK) {
+		prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+		crc = crc_t10dif_generic(crc, p, prealign);
+		len -= prealign;
+		p += prealign;
+	}
+
+	if (len & ~VMX_ALIGN_MASK) {
+		crc <<= 16;
+		preempt_disable();
+		pagefault_disable();
+		enable_kernel_altivec();
+		crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+		disable_kernel_altivec();
+		pagefault_enable();
+		preempt_enable();
+		crc >>= 16;
+	}
+
+	tail = len & VMX_ALIGN_MASK;
+	if (tail) {
+		p += len & ~VMX_ALIGN_MASK;
+		crc = crc_t10dif_generic(crc, p, tail);
+	}
+
+	return crc & 0xffff;
+}
+
+static int crct10dif_vpmsum_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = crct10dif_vpmsum(*crc, data, length);
+
+	return 0;
+}
+
+
+static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crcp = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crcp;
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.init		= crct10dif_vpmsum_init,
+	.update		= crct10dif_vpmsum_update,
+	.final		= crct10dif_vpmsum_final,
+	.descsize	= CRC_T10DIF_DIGEST_SIZE,
+	.digestsize	= CRC_T10DIF_DIGEST_SIZE,
+	.base		= {
+		.cra_name		= "crct10dif",
+		.cra_driver_name	= "crct10dif-vpmsum",
+		.cra_priority		= 200,
+		.cra_blocksize		= CRC_T10DIF_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+};
+
+static int __init crct10dif_vpmsum_mod_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_vpmsum_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init);
+module_exit(crct10dif_vpmsum_mod_fini);
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-vpmsum");
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index a916c4a61165..5f6a5af9c489 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -65,7 +65,6 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
-#define CONCAT(a,b)	a##b
 #define VMOVDQ		vmovdqu
 
 #define xdata0		%xmm0
@@ -92,8 +91,6 @@
 #define num_bytes	%r8
 
 #define tmp		%r10
-#define	DDQ(i)		CONCAT(ddq_add_,i)
-#define	XMM(i)		CONCAT(%xmm, i)
 #define	DDQ_DATA	0
 #define	XDATA		1
 #define KEY_128		1
@@ -131,12 +128,12 @@ ddq_add_8:
 /* generate a unique variable for ddq_add_x */
 
 .macro setddq n
-	var_ddq_add = DDQ(\n)
+	var_ddq_add = ddq_add_\n
 .endm
 
 /* generate a unique variable for xmm register */
 .macro setxdata n
-	var_xdata = XMM(\n)
+	var_xdata = %xmm\n
 .endm
 
 /* club the numeric 'id' to the symbol 'name' */
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index aa76cad9d262..af4840ab2a3d 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1522,7 +1522,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[2 * 4];
+	le128 buf[2 * 4];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
@@ -1540,7 +1540,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[2 * 4];
+	le128 buf[2 * 4];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 260a060d7275..24ac9fad832d 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -27,6 +27,7 @@
 
 #include <linux/module.h>
 #include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
@@ -457,7 +458,7 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
 	le128 ivblk = *iv;
 
 	/* generate next IV */
-	le128_gf128mul_x_ble(iv, &ivblk);
+	gf128mul_x_ble(iv, &ivblk);
 
 	/* CC <- T xor C */
 	u128_xor(dst, src, (u128 *)&ivblk);
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 644f97ab8cac..ac0e831943f5 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -328,7 +328,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	le128 buf[SERPENT_PARALLEL_BLOCKS];
 	struct crypt_priv crypt_ctx = {
 		.ctx = &ctx->crypt_ctx,
 		.fpu_enabled = false,
@@ -355,7 +355,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	le128 buf[SERPENT_PARALLEL_BLOCKS];
 	struct crypt_priv crypt_ctx = {
 		.ctx = &ctx->crypt_ctx,
 		.fpu_enabled = false,
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 2ebb5e9789f3..243e90a4b5d9 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -296,7 +296,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[3];
+	le128 buf[3];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
@@ -314,7 +314,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[3];
+	le128 buf[3];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 29e53ea7d764..ed8b66de541f 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -125,16 +125,6 @@ static inline void le128_inc(le128 *i)
 	i->b = cpu_to_le64(b);
 }
 
-static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
-{
-	u64 a = le64_to_cpu(src->a);
-	u64 b = le64_to_cpu(src->b);
-	u64 _tt = ((s64)a >> 63) & 0x87;
-
-	dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
-	dst->b = cpu_to_le64((b << 1) ^ _tt);
-}
-
 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
 				 struct blkcipher_desc *desc,
 				 struct scatterlist *dst,
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-03 01:53:46 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-03 01:53:46 +0300
commit	5a0387a8a8efb90ae7fea1e2e5c62de3efa74691 (patch)
tree	9e5bbbafe7fea01c843d86c7c3d40f29f962c474 /arch
parent	204f144c9fcac355843412b6ba1150086488a208 (diff)
parent	929562b144783b9212625305eadcbbd800809643 (diff)
download	linux-5a0387a8a8efb90ae7fea1e2e5c62de3efa74691.tar.xz