From 474fd6e80fe529e9adeeb7ea9d4e5d6c4da0b7fe Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 23 Aug 2016 13:30:24 +0200
Subject: RAID/s390: add SIMD implementation for raid6 gen/xor

Using vector registers is slightly faster:

raid6: vx128x8  gen() 19705 MB/s
raid6: vx128x8  xor() 11886 MB/s
raid6: using algorithm vx128x8 gen() 19705 MB/s
raid6: .... xor() 11886 MB/s, rmw enabled

vs the software algorithms:

raid6: int64x1  gen()  3018 MB/s
raid6: int64x1  xor()  1429 MB/s
raid6: int64x2  gen()  4661 MB/s
raid6: int64x2  xor()  3143 MB/s
raid6: int64x4  gen()  5392 MB/s
raid6: int64x4  xor()  3509 MB/s
raid6: int64x8  gen()  4441 MB/s
raid6: int64x8  xor()  3207 MB/s
raid6: using algorithm int64x4 gen() 5392 MB/s
raid6: .... xor() 3509 MB/s, rmw enabled

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 lib/raid6/algos.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'lib/raid6/algos.c')

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0434bd..e1923b602bbc 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -68,6 +68,9 @@ const struct raid6_calls * const raid6_algos[] = {
 #endif
 #if defined(CONFIG_TILEGX)
 	&raid6_tilegx8,
+#endif
+#if defined(CONFIG_S390)
+	&raid6_s390vx8,
 #endif
 	&raid6_intx1,
 	&raid6_intx2,
-- 
cgit v1.2.3


From f5b55fa1f81d518925d68b50d2316850c525d1ad Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 31 Aug 2016 09:27:35 +0200
Subject: RAID/s390: provide raid6 recovery optimization

The XC instruction can be used to improve the speed of the raid6
recovery. The loops now operate on blocks of 256 bytes.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 include/linux/raid/pq.h  |   1 +
 lib/raid6/Makefile       |   2 +-
 lib/raid6/algos.c        |   3 ++
 lib/raid6/recov_s390xc.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 lib/raid6/recov_s390xc.c

(limited to 'lib/raid6/algos.c')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index c032a6a408a6..395a4c674168 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -116,6 +116,7 @@ struct raid6_recov_calls {
 extern const struct raid6_recov_calls raid6_recov_intx1;
 extern const struct raid6_recov_calls raid6_recov_ssse3;
 extern const struct raid6_recov_calls raid6_recov_avx2;
+extern const struct raid6_recov_calls raid6_recov_s390xc;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 667b9607f8c0..29f503ebfd60 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -7,7 +7,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
-raid6_pq-$(CONFIG_S390) += s390vx8.o
+raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
 
 hostprogs-y	+= mktables
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index e1923b602bbc..592ff49df47d 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -97,6 +97,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #endif
 #ifdef CONFIG_AS_SSSE3
 	&raid6_recov_ssse3,
+#endif
+#ifdef CONFIG_S390
+	&raid6_recov_s390xc,
 #endif
 	&raid6_recov_intx1,
 	NULL
diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c
new file mode 100644
index 000000000000..b042dac826cc
--- /dev/null
+++ b/lib/raid6/recov_s390xc.c
@@ -0,0 +1,116 @@
+/*
+ * RAID-6 data recovery in dual failure mode based on the XC instruction.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/export.h>
+#include <linux/raid/pq.h>
+
+static inline void xor_block(u8 *p1, u8 *p2)	/* XOR the 256 bytes at p2 into the 256 bytes at p1 */
+{
+	typedef struct { u8 _[256]; } addrtype;	/* sizes the "m" operands to the whole 256-byte block */
+
+	asm volatile(
+		"	xc	0(256,%[p1]),0(%[p2])\n"
+		: "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2),
+		  [p1] "a" (p1), [p2] "a" (p2) : "cc");	/* p1 block is read-write, p2 block read-only; condition code is clobbered */
+}
+
+/* Recover two failed data blocks (faila, failb) in place, 256 bytes per loop pass. */
+static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila,
+		int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	int i;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	   Use the dead data pages as temporary storage for
+	   delta p and delta q */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;	/* gen_syndrome will write its P output into the faila buffer */
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;	/* ... and its Q output into the failb buffer */
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];	/* NOTE(review): index presumes faila < failb - confirm callers order them */
+	qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
+
+	/* Now do it... */
+	while (bytes) {	/* stops only at exactly 0 - NOTE(review): presumes bytes is a multiple of 256 */
+		xor_block(dp, p);	/* dp ^= p */
+		xor_block(dq, q);	/* dq ^= q */
+		for (i = 0; i < 256; i++)
+			dq[i] = pbmul[dp[i]] ^ qmul[dq[i]];	/* final contents of the failb buffer */
+		xor_block(dp, dq);	/* dp ^= dq: final contents of the faila buffer */
+		p += 256;
+		q += 256;
+		dp += 256;
+		dq += 256;
+		bytes -= 256;
+	}
+}
+
+/* Recover failure of one data block plus the P block, in place, 256 bytes per loop pass */
+static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila,
+		void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	int i;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	   Use the dead data page as temporary storage for delta q */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;	/* Q output goes to the faila buffer; the P slot is untouched, so P is written to p itself */
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	/* Now do it... */
+	while (bytes) {	/* stops only at exactly 0 - NOTE(review): presumes bytes is a multiple of 256 */
+		xor_block(dq, q);	/* dq ^= q */
+		for (i = 0; i < 256; i++)
+			dq[i] = qmul[dq[i]];	/* final contents of the faila buffer */
+		xor_block(p, dq);	/* p ^= dq: folds the rebuilt data into the P block */
+		p += 256;
+		q += 256;
+		dq += 256;
+		bytes -= 256;
+	}
+}
+
+
+const struct raid6_recov_calls raid6_recov_s390xc = {	/* recovery entry points built on xor_block() */
+	.data2 = raid6_2data_recov_s390xc,	/* two data blocks lost */
+	.datap = raid6_datap_recov_s390xc,	/* one data block plus P lost */
+	.valid = NULL,	/* no feature-check callback is supplied */
+	.name = "s390xc",
+	.priority = 1,	/* NOTE(review): presumably ranks this against the other recovery tables - confirm in algos.c */
+};
-- 
cgit v1.2.3


From e0a491c1296874a1aca51cc68452f12a4d950029 Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Fri, 12 Aug 2016 18:03:19 -0700
Subject: lib/raid6: Add AVX512 optimized gen_syndrome functions

Optimize RAID6 gen_syndrome functions to take advantage of
the 512-bit ZMM integer instructions introduced in AVX512.

The AVX512 optimized gen_syndrome functions are simply based
on avx2.c written by Yuanhan Liu and sse2.c written by hpa.

The patch was tested and benchmarked before submission on
hardware that advertises the AVX512 feature flags these instructions need.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 arch/x86/Makefile       |   5 +-
 include/linux/raid/pq.h |   3 +
 lib/raid6/Makefile      |   2 +-
 lib/raid6/algos.c       |   9 ++
 lib/raid6/avx512.c      | 294 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/raid6/x86.h         |  10 ++
 6 files changed, 320 insertions(+), 3 deletions(-)
 create mode 100644 lib/raid6/avx512.c

(limited to 'lib/raid6/algos.c')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 830ed391e7ef..2d449337a360 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -163,11 +163,12 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
 asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
 sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
 sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index a0118d5929a9..0c529a55b52e 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -102,6 +102,9 @@ extern const struct raid6_calls raid6_altivec8;
 extern const struct raid6_calls raid6_avx2x1;
 extern const struct raid6_calls raid6_avx2x2;
 extern const struct raid6_calls raid6_avx2x4;
+extern const struct raid6_calls raid6_avx512x1;
+extern const struct raid6_calls raid6_avx512x2;
+extern const struct raid6_calls raid6_avx512x4;
 extern const struct raid6_calls raid6_tilegx8;
 
 struct raid6_recov_calls {
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3b10a48fa040..8948268d47b4 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o
 
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0434bd..f5f090c52dd9 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_avx2x1,
 	&raid6_avx2x2,
 #endif
+#ifdef CONFIG_AS_AVX512
+	&raid6_avx512x1,
+	&raid6_avx512x2,
+#endif
 #endif
 #if defined(__x86_64__) && !defined(__arch_um__)
 	&raid6_sse2x1,
@@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_avx2x2,
 	&raid6_avx2x4,
 #endif
+#ifdef CONFIG_AS_AVX512
+	&raid6_avx512x1,
+	&raid6_avx512x2,
+	&raid6_avx512x4,
+#endif
 #endif
 #ifdef CONFIG_ALTIVEC
 	&raid6_altivec1,
diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
new file mode 100644
index 000000000000..b1188a6e51a6
--- /dev/null
+++ b/lib/raid6/avx512.c
@@ -0,0 +1,294 @@
+/* -*- linux-c -*- --------------------------------------------------------
+ *
+ *   Copyright (C) 2016 Intel Corporation
+ *
+ *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ *   Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
+ *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * AVX512 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static const struct raid6_avx512_constants {
+	u64 x1d[8];	/* 64 bytes of 0x1d, loaded whole into zmm0 by the gen_syndrome routines */
+} raid6_avx512_constants __aligned(512/8) = {	/* 512 bits = 64 bytes, the size of the aligned vmovdqa64 zmm load */
+	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
+};
+
+static int raid6_have_avx512(void)	/* feature check stored in the avx512x1/x2/x4 tables of this file */
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&	/* non-zero only if all six feature bits are set */
+		boot_cpu_has(X86_FEATURE_AVX) &&
+		boot_cpu_has(X86_FEATURE_AVX512F) &&
+		boot_cpu_has(X86_FEATURE_AVX512BW) &&	/* byte-granular ops used here: vpcmpgtb to mask, vpmovm2b, vpaddb */
+		boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{	/* Writes P (ptrs[disks-2]) and Q (ptrs[disks-1]) from the data blocks, 64 bytes per pass. */
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t" /* zmm0 = 64 x 0x1d */
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0; d < bytes; d += 64) { /* one zmm register (64 bytes) of every disk per pass */
+		asm volatile("prefetchnta %0\n\t"
+			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
+			     "prefetchnta %1\n\t"
+			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+			     "vmovdqa64 %1,%%zmm6"         /* data of disk z0-1, folded in by the next step */
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); /* NOTE(review): reads disk z0-1 unconditionally - presumes >= 2 data disks */
+		for (z = z0-2; z >= 0; z--) {
+			asm volatile("prefetchnta %0\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* k1: Q bytes with the top bit set (0 > byte, signed) */
+				     "vpmovm2b %%k1,%%zmm5\n\t"        /* mask bits -> 0xff / 0x00 bytes */
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* Q <<= 1 per byte */
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" /* 0x1d where the top bit was set */
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q = Q * 2 with 0x1d reduction */
+				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" /* P ^= pending data */
+				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" /* Q ^= pending data */
+				     "vmovdqa64 %0,%%zmm6"             /* fetch disk z for the next step */
+				     :
+				     : "m" (dptr[z][d]));
+		}
+		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* same step once more for the last pending block */
+			     "vpmovm2b %%k1,%%zmm5\n\t"
+			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+			     "vmovntdq %%zmm2,%0\n\t"          /* non-temporal store of P */
+			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+			     "vmovntdq %%zmm4,%1\n\t"          /* non-temporal store of Q */
+			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
+			     :
+			     : "m" (p[d]), "m" (q[d]));
+	}
+
+	asm volatile("sfence" : : : "memory"); /* fence after the non-temporal stores above */
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x1 = {
+	raid6_avx5121_gen_syndrome,	/* 64 bytes per loop pass */
+	NULL,                   /* XOR not yet implemented */
+	raid6_have_avx512,	/* CPU feature check */
+	"avx512x1",
+	1                       /* Has cache hints (prefetchnta loads, vmovntdq stores) */
+};
+
+/*
+ * Unrolled-by-2 AVX512 implementation: two zmm registers (128 bytes) per pass
+ */
+static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t" /* zmm0 = 64 x 0x1d */
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	/* We uniformly assume a single prefetch covers at least 64 bytes */
+	for (d = 0; d < bytes; d += 128) {
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
+			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
+			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
+			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
+		for (z = z0-1; z >= 0; z--) {
+			asm volatile("prefetchnta %0\n\t"
+				     "prefetchnta %1\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* k1/k2: Q bytes with the top bit set */
+				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"        /* mask bits -> 0xff / 0x00 bytes */
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* Q <<= 1 per byte */
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" /* 0x1d where the top bit was set */
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q = Q * 2 with 0x1d reduction */
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"         /* data of disk z */
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" /* P ^= data */
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q ^= data */
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t" /* non-temporal stores of P[0], P[1], Q[0], Q[1] */
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vmovntdq %%zmm4,%2\n\t"
+			     "vmovntdq %%zmm6,%3"
+			     :
+			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
+			       "m" (q[d+64]));
+	}
+
+	asm volatile("sfence" : : : "memory"); /* fence after the non-temporal stores above */
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x2 = {
+	raid6_avx5122_gen_syndrome,	/* 128 bytes per loop pass */
+	NULL,                   /* XOR not yet implemented */
+	raid6_have_avx512,	/* CPU feature check */
+	"avx512x2",
+	1                       /* Has cache hints (prefetchnta loads, vmovntdq stores) */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 AVX512 implementation: four zmm registers (256 bytes) per pass
+ */
+static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t"               /* zmm0 = 64 x 0x1d */
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
+		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
+		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
+		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
+		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
+		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
+		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
+		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
+		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0; d < bytes; d += 256) {
+		for (z = z0; z >= 0; z--) { /* accumulators start at zero, so every data disk goes through the same step */
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     "prefetchnta %2\n\t"
+			     "prefetchnta %3\n\t"
+			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* k1..k4: Q bytes with the top bit set */
+			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
+			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
+			     "vpmovm2b %%k1,%%zmm5\n\t"        /* mask bits -> 0xff / 0x00 bytes */
+			     "vpmovm2b %%k2,%%zmm7\n\t"
+			     "vpmovm2b %%k3,%%zmm13\n\t"
+			     "vpmovm2b %%k4,%%zmm15\n\t"
+			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* Q <<= 1 per byte */
+			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" /* 0x1d where the top bit was set */
+			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q = Q * 2 with 0x1d reduction */
+			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+			     "vmovdqa64 %0,%%zmm5\n\t"         /* data of disk z, four 64-byte pieces */
+			     "vmovdqa64 %1,%%zmm7\n\t"
+			     "vmovdqa64 %2,%%zmm13\n\t"
+			     "vmovdqa64 %3,%%zmm15\n\t"
+			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" /* P ^= data */
+			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q ^= data */
+			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+			     :
+			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"          /* store each accumulator, then zero it for the next pass */
+			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
+			     "vmovntdq %%zmm10,%2\n\t"
+			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
+			     "vmovntdq %%zmm11,%3\n\t"
+			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
+			     "vmovntdq %%zmm4,%4\n\t"
+			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vmovntdq %%zmm6,%5\n\t"
+			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
+			     "vmovntdq %%zmm12,%6\n\t"
+			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
+			     "vmovntdq %%zmm14,%7\n\t"
+			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
+			     :
+			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+			       "m" (q[d+128]), "m" (q[d+192]));
+	}
+
+	asm volatile("sfence" : : : "memory"); /* fence after the non-temporal stores above */
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x4 = {
+	raid6_avx5124_gen_syndrome,	/* 256 bytes per loop pass */
+	NULL,                   /* XOR not yet implemented */
+	raid6_have_avx512,	/* CPU feature check */
+	"avx512x4",
+	1                       /* Has cache hints (prefetchnta loads, vmovntdq stores) */
+};
+#endif
+
+#endif /* CONFIG_AS_AVX512 */
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index 8fe9d9662abb..834d268a4b05 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void)
 #define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_AVX2        (9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_AVX512F     (9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ    (9*32+17) /* AVX-512 DQ (Double/Quad granular)
+					   * Instructions
+					   */
+#define X86_FEATURE_AVX512BW    (9*32+30) /* AVX-512 BW (Byte/Word granular)
+					   * Instructions
+					   */
+#define X86_FEATURE_AVX512VL    (9*32+31) /* AVX-512 VL (128/256 Vector Length)
+					   * Extensions
+					   */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */
-- 
cgit v1.2.3


From 13c520b2993c9faae6770264d33ff1e1ea4c2ceb Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Fri, 12 Aug 2016 18:03:20 -0700
Subject: lib/raid6: Add AVX512 optimized recovery functions

Optimize RAID6 recovery functions to take advantage of
the 512-bit ZMM integer instructions introduced in AVX512.

The AVX512 optimized recovery functions are simply based
on recov_avx2.c written by Jim Kukunas.

This patch was tested and benchmarked before submission on
hardware that advertises the AVX512 feature flags these instructions need.

Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 include/linux/raid/pq.h  |   1 +
 lib/raid6/Makefile       |   2 +-
 lib/raid6/algos.c        |   3 +
 lib/raid6/recov_avx512.c | 388 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 393 insertions(+), 1 deletion(-)
 create mode 100644 lib/raid6/recov_avx512.c

(limited to 'lib/raid6/algos.c')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 0c529a55b52e..1abd89584568 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -118,6 +118,7 @@ struct raid6_recov_calls {
 extern const struct raid6_recov_calls raid6_recov_intx1;
 extern const struct raid6_recov_calls raid6_recov_ssse3;
 extern const struct raid6_recov_calls raid6_recov_avx2;
+extern const struct raid6_recov_calls raid6_recov_avx512;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 8948268d47b4..cd05ee1fb809 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o
 
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index f5f090c52dd9..149d947a4fec 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -98,6 +98,9 @@ void (*raid6_datap_recov)(int, size_t, int, void **);
 EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
 const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_AS_AVX512
+	&raid6_recov_avx512,
+#endif
 #ifdef CONFIG_AS_AVX2
 	&raid6_recov_avx2,
 #endif
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
new file mode 100644
index 000000000000..625aafa33b61
--- /dev/null
+++ b/lib/raid6/recov_avx512.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx512(void)	/* .valid callback of raid6_recov_avx512 */
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&	/* non-zero only if all six feature bits are set */
+		boot_cpu_has(X86_FEATURE_AVX) &&
+		boot_cpu_has(X86_FEATURE_AVX512F) &&
+		boot_cpu_has(X86_FEATURE_AVX512BW) &&	/* byte/word ops used here: zmm vpshufb, vpsraw */
+		boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		boot_cpu_has(X86_FEATURE_AVX512DQ);	/* vbroadcasti64x2 */
+}
+
+static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
+				     int failb, void **ptrs)
+{	/* Rebuilds data blocks faila and failb in place from P, Q and the remaining data. */
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	const u8 x0f = 0x0f;	/* low-nibble mask, broadcast into zmm7 below */
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];	/* NOTE(review): index presumes faila < failb - confirm */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/* zmm7 = x0f[64] */
+	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+	while (bytes) {	/* stops only at exactly 0: bytes must be a multiple of the step (128, or 64 without CONFIG_X86_64) */
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+			     "vmovdqa64 %1, %%zmm9\n\t"
+			     "vmovdqa64 %2, %%zmm0\n\t"
+			     "vmovdqa64 %3, %%zmm8\n\t"
+			     "vpxorq %4, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %5, %%zmm9, %%zmm9\n\t"
+			     "vpxorq %6, %%zmm0, %%zmm0\n\t"
+			     "vpxorq %7, %%zmm8, %%zmm8"
+			     :
+			     : "m" (q[0]), "m" (q[64]), "m" (p[0]),
+			       "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
+			       "m" (dp[0]), "m" (dp[64]));
+
+		/*
+		 * 1 = dq[0]  ^ q[0]
+		 * 9 = dq[64] ^ q[64]
+		 * 0 = dp[0]  ^ p[0]
+		 * 8 = dp[64] ^ p[64]
+		 */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" /* 16-byte table applied to the low nibbles */
+			     "vbroadcasti64x2 %1, %%zmm5"     /* 16-byte table applied to the high nibbles */
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+			     "vpsraw $4, %%zmm9, %%zmm12\n\t"
+			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"   /* low nibbles */
+			     "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"   /* high nibbles */
+			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+			     "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t" /* per-nibble table lookups */
+			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
+			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+			     "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t" /* combine both halves */
+			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
+			     :
+			     : );
+
+		/*
+		 * 5 = qx[0]
+		 * 15 = qx[64]
+		 */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1\n\t"
+			     "vpsraw $4, %%zmm0, %%zmm2\n\t"
+			     "vpsraw $4, %%zmm8, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
+			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
+			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
+			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm12, %%zmm13, %%zmm13"
+			     :
+			     : "m" (pbmul[0]), "m" (pbmul[16]));
+
+		/*
+		 * 1  = pbmul[px[0]]
+		 * 13 = pbmul[px[64]]
+		 */
+		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm15, %%zmm13, %%zmm13"
+			     :
+			     : );
+
+		/*
+		 * 1 = db = DQ
+		 * 13 = db[64] = DQ[64]
+		 */
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+			     "vmovdqa64 %%zmm13,%1\n\t"
+			     "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+			     "vpxorq %%zmm13, %%zmm8, %%zmm8"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]));
+
+		asm volatile("vmovdqa64 %%zmm0, %0\n\t" /* 0/8 = px ^ db, written to the faila buffer */
+			     "vmovdqa64 %%zmm8, %1"
+			     :
+			     : "m" (dp[0]), "m" (dp[64]));
+
+		bytes -= 128;
+		p += 128;
+		q += 128;
+		dp += 128;
+		dq += 128;
+#else
+		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+			     "vmovdqa64 %1, %%zmm0\n\t"
+			     "vpxorq %2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %3, %%zmm0, %%zmm0"
+			     :
+			     : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));
+
+		/* 1 = dq ^ q;  0 = dp ^ p */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm5"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		/*
+		 * 1 = dq ^ q
+		 * 3 = (dq ^ q) >> 4
+		 */
+		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
+			     :
+			     : );
+
+		/* 5 = qx */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1"
+			     :
+			     : "m" (pbmul[0]), "m" (pbmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
+			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm4, %%zmm1, %%zmm1"
+			     :
+			     : );
+
+		/* 1 = pbmul[px] */
+		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+			     /* 1 = db = DQ */
+			     "vmovdqa64 %%zmm1, %0\n\t"
+			     :
+			     : "m" (dq[0]));
+
+		asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" /* 0 = px ^ db, written to the faila buffer */
+			     "vmovdqa64 %%zmm0, %0"
+			     :
+			     : "m" (dp[0]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
+				     void **ptrs)
+{	/* Rebuilds data block faila and the P block in place from Q and the remaining data. */
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	const u8 x0f = 0x0f;	/* low-nibble mask, broadcast into zmm7 below */
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;	/* the P slot is left alone, so gen_syndrome writes P into p itself */
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); /* zmm7 = x0f[64] */
+
+	while (bytes) {	/* stops only at exactly 0: bytes must be a multiple of the step (128, or 64 without CONFIG_X86_64) */
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+			     "vmovdqa64 %1, %%zmm8\n\t"
+			     "vpxorq %2, %%zmm3, %%zmm3\n\t"
+			     "vpxorq %3, %%zmm8, %%zmm8"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
+			       "m" (q[64]));
+
+		/*
+		 * 3 = q[0] ^ dq[0]
+		 * 8 = q[64] ^ dq[64]
+		 */
+		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" /* 16-byte table applied to the low nibbles */
+			     "vmovapd %%zmm0, %%zmm13\n\t"    /* second copy for the +64 half */
+			     "vbroadcasti64x2 %1, %%zmm1\n\t" /* 16-byte table applied to the high nibbles */
+			     "vmovapd %%zmm1, %%zmm14"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+			     "vpsraw $4, %%zmm8, %%zmm12\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"   /* low nibbles */
+			     "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"   /* high nibbles */
+			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"  /* per-nibble table lookups */
+			     "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+			     "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
+			     "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"   /* combine both halves */
+			     "vpxorq %%zmm13, %%zmm14, %%zmm14"
+			     :
+			     : );
+
+		/*
+		 * 1  = qmul[q[0]  ^ dq[0]]
+		 * 14 = qmul[q[64] ^ dq[64]]
+		 */
+		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+			     "vmovdqa64 %1, %%zmm12\n\t"
+			     "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
+			     "vpxorq %%zmm14, %%zmm12, %%zmm12"
+			     :
+			     : "m" (p[0]), "m" (p[64]));
+
+		/*
+		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
+		 * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
+		 */
+
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t" /* rebuilt data -> faila buffer */
+			     "vmovdqa64 %%zmm14, %1\n\t"
+			     "vmovdqa64 %%zmm2, %2\n\t" /* updated P -> p */
+			     "vmovdqa64 %%zmm12,%3"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
+			       "m" (p[64]));
+
+		bytes -= 128;
+		p += 128;
+		q += 128;
+		dq += 128;
+#else
+		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+			     "vpxorq %1, %%zmm3, %%zmm3"
+			     :
+			     : "m" (dq[0]), "m" (q[0]));
+
+		/* 3 = q ^ dq */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm0, %%zmm1, %%zmm1"
+			     :
+			     : );
+
+		/* 1 = qmul[q ^ dq] */
+
+		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+			     "vpxorq %%zmm1, %%zmm2, %%zmm2"
+			     :
+			     : "m" (p[0]));
+
+		/* 2 = p ^ qmul[q ^ dq] */
+
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t" /* rebuilt data -> faila buffer */
+			     "vmovdqa64 %%zmm2, %1"     /* updated P -> p */
+			     :
+			     : "m" (dq[0]), "m" (p[0]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx512 = {
+	.data2 = raid6_2data_recov_avx512,	/* two data blocks lost */
+	.datap = raid6_datap_recov_avx512,	/* one data block plus P lost */
+	.valid = raid6_has_avx512,	/* CPU feature check */
+#ifdef CONFIG_X86_64
+	.name = "avx512x2",	/* the 64-bit loops handle two zmm registers (128 bytes) per pass */
+#else
+	.name = "avx512x1",	/* otherwise one zmm register (64 bytes) per pass */
+#endif
+	.priority = 3,	/* NOTE(review): presumably ranks this against the other recovery tables - confirm in algos.c */
+};
+
+#else
+#warning "your version of binutils lacks AVX512 support"
+#endif
-- 
cgit v1.2.3