From 1a445e8efaa4334457c4d1f48a5d1d829b503f0c Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Fri, 13 May 2016 14:02:00 +0200
Subject: crypto: sha-ssse3 - add MODULE_ALIAS

Add the MODULE_ALIAS for the cra_driver_name of the different ciphers to
allow an automated loading if a driver name is used.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha1_ssse3_glue.c   |  6 ++++++
 arch/x86/crypto/sha256_ssse3_glue.c | 10 ++++++++++
 arch/x86/crypto/sha512_ssse3_glue.c |  6 ++++++
 3 files changed, 22 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 1024e378a358..fc61739150e7 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -374,3 +374,9 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS_CRYPTO("sha1");
+MODULE_ALIAS_CRYPTO("sha1-ssse3");
+MODULE_ALIAS_CRYPTO("sha1-avx");
+MODULE_ALIAS_CRYPTO("sha1-avx2");
+#ifdef CONFIG_AS_SHA1_NI
+MODULE_ALIAS_CRYPTO("sha1-ni");
+#endif
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 3ae0f43ebd37..9e79baf03a4b 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -427,4 +427,14 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha256-ssse3");
+MODULE_ALIAS_CRYPTO("sha256-avx");
+MODULE_ALIAS_CRYPTO("sha256-avx2");
 MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha224-ssse3");
+MODULE_ALIAS_CRYPTO("sha224-avx");
+MODULE_ALIAS_CRYPTO("sha224-avx2");
+#ifdef CONFIG_AS_SHA256_NI
+MODULE_ALIAS_CRYPTO("sha256-ni");
+MODULE_ALIAS_CRYPTO("sha224-ni");
+#endif
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0b17c83d027d..2b0e2a6825f3 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -346,4 +346,10 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha512-ssse3");
+MODULE_ALIAS_CRYPTO("sha512-avx");
+MODULE_ALIAS_CRYPTO("sha512-avx2");
 MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha384-ssse3");
+MODULE_ALIAS_CRYPTO("sha384-avx");
+MODULE_ALIAS_CRYPTO("sha384-avx2");
-- 
cgit v1.2.3


From 0a7f330c12f2b3cab53b396d4dc369c2dcf272eb Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@linux.intel.com>
Date: Tue, 31 May 2016 14:42:20 -0700
Subject: crypto: sha1-mb - stylistic cleanup

Currently there are several checkpatch warnings in the sha1_mb.c file:
'WARNING: line over 80 characters' in the sha1_mb.c file. Also, the
syntax of some multi-line comments are not correct. This patch fixes
these issues.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb.c | 110 ++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 36 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index 9c5af331a956..0a464919542c 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -77,7 +77,8 @@ struct sha1_mb_ctx {
 	struct mcryptd_ahash *mcryptd_tfm;
 };
 
-static inline struct mcryptd_hash_request_ctx *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
+static inline struct mcryptd_hash_request_ctx
+		*cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
 {
 	struct shash_desc *desc;
 
@@ -85,7 +86,8 @@ static inline struct mcryptd_hash_request_ctx *cast_hash_to_mcryptd_ctx(struct s
 	return container_of(desc, struct mcryptd_hash_request_ctx, desc);
 }
 
-static inline struct ahash_request *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+static inline struct ahash_request
+		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
 {
 	return container_of((void *) ctx, struct ahash_request, __ctx);
 }
@@ -97,10 +99,12 @@ static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
 }
 
 static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *state,
-							  struct job_sha1 *job);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)
+			(struct sha1_mb_mgr *state, struct job_sha1 *job);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)
+						(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)
+						(struct sha1_mb_mgr *state);
 
 static inline void sha1_init_digest(uint32_t *digest)
 {
@@ -131,7 +135,8 @@ static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
 	return i >> SHA1_LOG2_BLOCK_SIZE;
 }
 
-static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, struct sha1_hash_ctx *ctx)
+static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr,
+						struct sha1_hash_ctx *ctx)
 {
 	while (ctx) {
 		if (ctx->status & HASH_CTX_STS_COMPLETE) {
@@ -177,8 +182,8 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str
 
 				ctx->job.buffer = (uint8_t *) buffer;
 				ctx->job.len = len;
-				ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr,
-										  &ctx->job);
+				ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr,
+										&ctx->job);
 				continue;
 			}
 		}
@@ -191,13 +196,15 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str
 		if (ctx->status & HASH_CTX_STS_LAST) {
 
 			uint8_t *buf = ctx->partial_block_buffer;
-			uint32_t n_extra_blocks = sha1_pad(buf, ctx->total_length);
+			uint32_t n_extra_blocks =
+					sha1_pad(buf, ctx->total_length);
 
 			ctx->status = (HASH_CTX_STS_PROCESSING |
 				       HASH_CTX_STS_COMPLETE);
 			ctx->job.buffer = buf;
 			ctx->job.len = (uint32_t) n_extra_blocks;
-			ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+			ctx = (struct sha1_hash_ctx *)
+				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
 			continue;
 		}
 
@@ -208,14 +215,17 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str
 	return NULL;
 }
 
-static struct sha1_hash_ctx *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
+static struct sha1_hash_ctx
+			*sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
 {
 	/*
 	 * If get_comp_job returns NULL, there are no jobs complete.
-	 * If get_comp_job returns a job, verify that it is safe to return to the user.
+	 * If get_comp_job returns a job, verify that it is safe to return to
+	 * the user.
 	 * If it is not ready, resubmit the job to finish processing.
 	 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
-	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr still need processing.
+	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr
+	 * still need processing.
 	 */
 	struct sha1_hash_ctx *ctx;
 
@@ -235,7 +245,10 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 					  int flags)
 {
 	if (flags & (~HASH_ENTIRE)) {
-		/* User should not pass anything other than FIRST, UPDATE, or LAST */
+		/*
+		 * User should not pass anything other than FIRST, UPDATE, or
+		 * LAST
+		 */
 		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
 		return ctx;
 	}
@@ -264,14 +277,20 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 		ctx->partial_block_buffer_length = 0;
 	}
 
-	/* If we made it here, there were no errors during this call to submit */
+	/*
+	 * If we made it here, there were no errors during this call to
+	 * submit
+	 */
 	ctx->error = HASH_CTX_ERROR_NONE;
 
 	/* Store buffer ptr info from user */
 	ctx->incoming_buffer = buffer;
 	ctx->incoming_buffer_length = len;
 
-	/* Store the user's request flags and mark this ctx as currently being processed. */
+	/*
+	 * Store the user's request flags and mark this ctx as currently
+	 * being processed.
+	 */
 	ctx->status = (flags & HASH_LAST) ?
 			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
 			HASH_CTX_STS_PROCESSING;
@@ -286,8 +305,12 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 	 * append as much as possible to the extra block.
 	 */
 	if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
-		/* Compute how many bytes to copy from user buffer into extra block */
-		uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		/*
+		 * Compute how many bytes to copy from user buffer into
+		 * extra block
+		 */
+		uint32_t copy_len = SHA1_BLOCK_SIZE -
+					ctx->partial_block_buffer_length;
 		if (len < copy_len)
 			copy_len = len;
 
@@ -297,20 +320,28 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 				buffer, copy_len);
 
 			ctx->partial_block_buffer_length += copy_len;
-			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer = (const void *)
+					((const char *)buffer + copy_len);
 			ctx->incoming_buffer_length = len - copy_len;
 		}
 
-		/* The extra block should never contain more than 1 block here */
+		/*
+		 * The extra block should never contain more than 1 block
+		 * here
+		 */
 		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
 
-		/* If the extra block buffer contains exactly 1 block, it can be hashed. */
+		/*
+		 * If the extra block buffer contains exactly 1 block, it can
+		 * be hashed.
+		 */
 		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
 			ctx->partial_block_buffer_length = 0;
 
 			ctx->job.buffer = ctx->partial_block_buffer;
 			ctx->job.len = 1;
-			ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+			ctx = (struct sha1_hash_ctx *)
+				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
 		}
 	}
 
@@ -329,14 +360,15 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
 			return NULL;
 
 		/*
-		 * If flush returned a job, resubmit the job to finish processing.
+		 * If flush returned a job, resubmit the job to finish
+		 * processing.
 		 */
 		ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
 
 		/*
-		 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
-		 * Otherwise, all jobs currently being managed by the sha1_ctx_mgr
-		 * still need processing. Loop.
+		 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be
+		 * returned. Otherwise, all jobs currently being managed by the
+		 * sha1_ctx_mgr still need processing. Loop.
 		 */
 		if (ctx)
 			return ctx;
@@ -394,9 +426,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
 				flag |= HASH_LAST;
 
 		}
-		sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(&rctx->desc);
+		sha_ctx = (struct sha1_hash_ctx *)
+						shash_desc_ctx(&rctx->desc);
 		kernel_fpu_begin();
-		sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag);
+		sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
 		if (!sha_ctx) {
 			if (flush)
 				sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
@@ -489,7 +523,7 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
 			  unsigned int len)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-			container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(desc, struct mcryptd_hash_request_ctx, desc);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -521,7 +555,8 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
 	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
 	sha1_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE);
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -548,7 +583,7 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
 			     unsigned int len, u8 *out)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-			container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(desc, struct mcryptd_hash_request_ctx, desc);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -584,7 +619,8 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
 	sha1_mb_add_list(rctx, cstate);
 
 	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag);
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -608,7 +644,7 @@ done:
 static int sha1_mb_final(struct shash_desc *desc, u8 *out)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-			container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(desc, struct mcryptd_hash_request_ctx, desc);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -632,7 +668,8 @@ static int sha1_mb_final(struct shash_desc *desc, u8 *out)
 	/* flag HASH_FINAL and 0 data size */
 	sha1_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, HASH_LAST);
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+								HASH_LAST);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -866,7 +903,8 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
 		if (time_before(cur_time, rctx->tag.expire))
 			break;
 		kernel_fpu_begin();
-		sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
+		sha_ctx = (struct sha1_hash_ctx *)
+					sha1_ctx_mgr_flush(cstate->mgr);
 		kernel_fpu_end();
 		if (!sha_ctx) {
 			pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
-- 
cgit v1.2.3


From 7ea0da1d75e0d03f9ec484d2038e3caf06024b17 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 20 Jun 2016 13:19:17 +0800
Subject: crypto: chacha20-simd - Use generic code for small requests

On 16-byte requests the optimised version is actually slower than
the generic code, so we should simply use that instead.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
---
 arch/x86/crypto/chacha20_glue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 2d5c2e0bd939..f910d1d449f0 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -70,7 +70,7 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
 	struct blkcipher_walk walk;
 	int err;
 
-	if (!may_use_simd())
+	if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
 		return crypto_chacha20_crypt(desc, dst, src, nbytes);
 
 	state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
-- 
cgit v1.2.3


From 38b2f68b426429c06cdf2ae5c8a89371524db203 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 21 Jun 2016 16:55:14 +0800
Subject: crypto: aesni - Fix cryptd reordering problem on gcm

This patch fixes an old bug where gcm requests can be reordered
because some are processed by cryptd while others are processed
directly in softirq context.

The fix is to always postpone to cryptd if there are currently
requests outstanding from the same tfm.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 5b7fa1471007..9e15572ef06d 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1098,9 +1098,12 @@ static int rfc4106_encrypt(struct aead_request *req)
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
 	struct cryptd_aead *cryptd_tfm = *ctx;
 
-	aead_request_set_tfm(req, irq_fpu_usable() ?
-				  cryptd_aead_child(cryptd_tfm) :
-				  &cryptd_tfm->base);
+	tfm = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		tfm = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, tfm);
 
 	return crypto_aead_encrypt(req);
 }
@@ -1111,9 +1114,12 @@ static int rfc4106_decrypt(struct aead_request *req)
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
 	struct cryptd_aead *cryptd_tfm = *ctx;
 
-	aead_request_set_tfm(req, irq_fpu_usable() ?
-				  cryptd_aead_child(cryptd_tfm) :
-				  &cryptd_tfm->base);
+	tfm = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		tfm = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, tfm);
 
 	return crypto_aead_decrypt(req);
 }
-- 
cgit v1.2.3


From 7271b33cb87e80f3a416fb031ad3ca87f0bea80a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 21 Jun 2016 16:55:16 +0800
Subject: crypto: ghash-clmulni - Fix cryptd reordering

This patch fixes an old bug where requests can be reordered because
some are processed by cryptd while others are processed directly
in softirq context.

The fix is to always postpone to cryptd if there are currently
requests outstanding from the same tfm.

This patch also removes the redundant use of cryptd in the async
init function as init never touches the FPU.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/ghash-clmulni-intel_glue.c | 40 +++++++++++++-----------------
 1 file changed, 17 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index a69321a77783..0420bab19efb 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -168,30 +168,23 @@ static int ghash_async_init(struct ahash_request *req)
 	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
 	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+	struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
 
-	if (!irq_fpu_usable()) {
-		memcpy(cryptd_req, req, sizeof(*req));
-		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
-		return crypto_ahash_init(cryptd_req);
-	} else {
-		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
-
-		desc->tfm = child;
-		desc->flags = req->base.flags;
-		return crypto_shash_init(desc);
-	}
+	desc->tfm = child;
+	desc->flags = req->base.flags;
+	return crypto_shash_init(desc);
 }
 
 static int ghash_async_update(struct ahash_request *req)
 {
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!irq_fpu_usable()) {
-		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_update(cryptd_req);
@@ -204,12 +197,12 @@ static int ghash_async_update(struct ahash_request *req)
 static int ghash_async_final(struct ahash_request *req)
 {
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!irq_fpu_usable()) {
-		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_final(cryptd_req);
@@ -249,7 +242,8 @@ static int ghash_async_digest(struct ahash_request *req)
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
 	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!irq_fpu_usable()) {
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_digest(cryptd_req);
-- 
cgit v1.2.3


From 331bf739c4f9992a73547d20bd8f2378b97d386a Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@linux.intel.com>
Date: Tue, 21 Jun 2016 18:21:46 -0700
Subject: crypto: sha1-mb - async implementation for sha1-mb

Herbert wants the sha1-mb algorithm to have an async implementation:
https://lkml.org/lkml/2016/4/5/286.
Currently, sha1-mb uses an async interface for the outer algorithm
and a sync interface for the inner algorithm. This patch introduces
a async interface for even the inner algorithm.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb.c | 182 ++++++++++++++++++++++-----------------
 crypto/mcryptd.c                 | 132 ++++++++++++----------------
 include/crypto/internal/hash.h   |  12 +--
 include/crypto/mcryptd.h         |   8 +-
 4 files changed, 165 insertions(+), 169 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index 0a464919542c..669cc37268e1 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -80,10 +80,10 @@ struct sha1_mb_ctx {
 static inline struct mcryptd_hash_request_ctx
 		*cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
 {
-	struct shash_desc *desc;
+	struct ahash_request *areq;
 
-	desc = container_of((void *) hash_ctx, struct shash_desc, __ctx);
-	return container_of(desc, struct mcryptd_hash_request_ctx, desc);
+	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
 }
 
 static inline struct ahash_request
@@ -93,7 +93,7 @@ static inline struct ahash_request
 }
 
 static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
-				struct shash_desc *desc)
+				struct ahash_request *areq)
 {
 	rctx->flag = HASH_UPDATE;
 }
@@ -375,9 +375,9 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
 	}
 }
 
-static int sha1_mb_init(struct shash_desc *desc)
+static int sha1_mb_init(struct ahash_request *areq)
 {
-	struct sha1_hash_ctx *sctx = shash_desc_ctx(desc);
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
 
 	hash_ctx_init(sctx);
 	sctx->job.result_digest[0] = SHA1_H0;
@@ -395,7 +395,7 @@ static int sha1_mb_init(struct shash_desc *desc)
 static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
 {
 	int	i;
-	struct	sha1_hash_ctx *sctx = shash_desc_ctx(&rctx->desc);
+	struct	sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
 	__be32	*dst = (__be32 *) rctx->out;
 
 	for (i = 0; i < 5; ++i)
@@ -427,7 +427,7 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
 
 		}
 		sha_ctx = (struct sha1_hash_ctx *)
-						shash_desc_ctx(&rctx->desc);
+						ahash_request_ctx(&rctx->areq);
 		kernel_fpu_begin();
 		sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
 						rctx->walk.data, nbytes, flag);
@@ -519,11 +519,10 @@ static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
 	mcryptd_arm_flusher(cstate, delay);
 }
 
-static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
-			  unsigned int len)
+static int sha1_mb_update(struct ahash_request *areq)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-		container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -539,7 +538,7 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
 	}
 
 	/* need to init context */
-	req_ctx_init(rctx, desc);
+	req_ctx_init(rctx, areq);
 
 	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
 
@@ -552,7 +551,7 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
 		rctx->flag |= HASH_DONE;
 
 	/* submit */
-	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
 	sha1_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
 	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
@@ -579,11 +578,10 @@ done:
 	return ret;
 }
 
-static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, u8 *out)
+static int sha1_mb_finup(struct ahash_request *areq)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-		container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -598,7 +596,7 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
 	}
 
 	/* need to init context */
-	req_ctx_init(rctx, desc);
+	req_ctx_init(rctx, areq);
 
 	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
 
@@ -611,11 +609,10 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
 		rctx->flag |= HASH_DONE;
 		flag = HASH_LAST;
 	}
-	rctx->out = out;
 
 	/* submit */
 	rctx->flag |= HASH_FINAL;
-	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
 	sha1_mb_add_list(rctx, cstate);
 
 	kernel_fpu_begin();
@@ -641,10 +638,10 @@ done:
 	return ret;
 }
 
-static int sha1_mb_final(struct shash_desc *desc, u8 *out)
+static int sha1_mb_final(struct ahash_request *areq)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-		container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -659,12 +656,11 @@ static int sha1_mb_final(struct shash_desc *desc, u8 *out)
 	}
 
 	/* need to init context */
-	req_ctx_init(rctx, desc);
+	req_ctx_init(rctx, areq);
 
-	rctx->out = out;
 	rctx->flag |= HASH_DONE | HASH_FINAL;
 
-	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
 	/* flag HASH_FINAL and 0 data size */
 	sha1_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
@@ -691,48 +687,98 @@ done:
 	return ret;
 }
 
-static int sha1_mb_export(struct shash_desc *desc, void *out)
+static int sha1_mb_export(struct ahash_request *areq, void *out)
 {
-	struct sha1_hash_ctx *sctx = shash_desc_ctx(desc);
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
 
 	memcpy(out, sctx, sizeof(*sctx));
 
 	return 0;
 }
 
-static int sha1_mb_import(struct shash_desc *desc, const void *in)
+static int sha1_mb_import(struct ahash_request *areq, const void *in)
 {
-	struct sha1_hash_ctx *sctx = shash_desc_ctx(desc);
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
 
 	memcpy(sctx, in, sizeof(*sctx));
 
 	return 0;
 }
 
+static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct mcryptd_ahash *mcryptd_tfm;
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_hash_ctx *mctx;
 
-static struct shash_alg sha1_mb_shash_alg = {
-	.digestsize	=	SHA1_DIGEST_SIZE,
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+						CRYPTO_ALG_INTERNAL,
+						CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(mcryptd_tfm))
+		return PTR_ERR(mcryptd_tfm);
+	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+	mctx->alg_state = &sha1_mb_alg_state;
+	ctx->mcryptd_tfm = mcryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+	return 0;
+}
+
+static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				sizeof(struct sha1_hash_ctx));
+
+	return 0;
+}
+
+static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha1_mb_areq_alg = {
 	.init		=	sha1_mb_init,
 	.update		=	sha1_mb_update,
 	.final		=	sha1_mb_final,
 	.finup		=	sha1_mb_finup,
 	.export		=	sha1_mb_export,
 	.import		=	sha1_mb_import,
-	.descsize	=	sizeof(struct sha1_hash_ctx),
-	.statesize	=	sizeof(struct sha1_hash_ctx),
-	.base		=	{
-		.cra_name	 = "__sha1-mb",
-		.cra_driver_name = "__intel_sha1-mb",
-		.cra_priority	 = 100,
-		/*
-		 * use ASYNC flag as some buffers in multi-buffer
-		 * algo may not have completed before hashing thread sleep
-		 */
-		.cra_flags	 = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC |
-				   CRYPTO_ALG_INTERNAL,
-		.cra_blocksize	 = SHA1_BLOCK_SIZE,
-		.cra_module	 = THIS_MODULE,
-		.cra_list	 = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
+	.halg		=	{
+		.digestsize	=	SHA1_DIGEST_SIZE,
+		.statesize	=	sizeof(struct sha1_hash_ctx),
+		.base		=	{
+			.cra_name	 = "__sha1-mb",
+			.cra_driver_name = "__intel_sha1-mb",
+			.cra_priority	 = 100,
+			/*
+			 * use ASYNC flag as some buffers in multi-buffer
+			 * algo may not have completed before hashing thread
+			 * sleep
+			 */
+			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
+						CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_INTERNAL,
+			.cra_blocksize	= SHA1_BLOCK_SIZE,
+			.cra_module	= THIS_MODULE,
+			.cra_list	= LIST_HEAD_INIT
+					(sha1_mb_areq_alg.halg.base.cra_list),
+			.cra_init	= sha1_mb_areq_init_tfm,
+			.cra_exit	= sha1_mb_areq_exit_tfm,
+			.cra_ctxsize	= sizeof(struct sha1_hash_ctx),
+		}
 	}
 };
 
@@ -817,46 +863,20 @@ static int sha1_mb_async_import(struct ahash_request *req, const void *in)
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
 	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-	struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm);
+	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
 	struct mcryptd_hash_request_ctx *rctx;
-	struct shash_desc *desc;
+	struct ahash_request *areq;
 
 	memcpy(mcryptd_req, req, sizeof(*req));
 	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
 	rctx = ahash_request_ctx(mcryptd_req);
-	desc = &rctx->desc;
-	desc->tfm = child;
-	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	return crypto_ahash_import(mcryptd_req, in);
-}
-
-static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
-{
-	struct mcryptd_ahash *mcryptd_tfm;
-	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct mcryptd_hash_ctx *mctx;
+	areq = &rctx->areq;
 
-	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
-					  CRYPTO_ALG_INTERNAL,
-					  CRYPTO_ALG_INTERNAL);
-	if (IS_ERR(mcryptd_tfm))
-		return PTR_ERR(mcryptd_tfm);
-	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
-	mctx->alg_state = &sha1_mb_alg_state;
-	ctx->mcryptd_tfm = mcryptd_tfm;
-	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
-				 sizeof(struct ahash_request) +
-				 crypto_ahash_reqsize(&mcryptd_tfm->base));
-
-	return 0;
-}
-
-static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
-{
-	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	ahash_request_set_tfm(areq, child);
+	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+					rctx->complete, req);
 
-	mcryptd_free_ahash(ctx->mcryptd_tfm);
+	return crypto_ahash_import(mcryptd_req, in);
 }
 
 static struct ahash_alg sha1_mb_async_alg = {
@@ -965,7 +985,7 @@ static int __init sha1_mb_mod_init(void)
 	}
 	sha1_mb_alg_state.flusher = &sha1_mb_flusher;
 
-	err = crypto_register_shash(&sha1_mb_shash_alg);
+	err = crypto_register_ahash(&sha1_mb_areq_alg);
 	if (err)
 		goto err2;
 	err = crypto_register_ahash(&sha1_mb_async_alg);
@@ -975,7 +995,7 @@ static int __init sha1_mb_mod_init(void)
 
 	return 0;
 err1:
-	crypto_unregister_shash(&sha1_mb_shash_alg);
+	crypto_unregister_ahash(&sha1_mb_areq_alg);
 err2:
 	for_each_possible_cpu(cpu) {
 		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
@@ -991,7 +1011,7 @@ static void __exit sha1_mb_mod_fini(void)
 	struct mcryptd_alg_cstate *cpu_state;
 
 	crypto_unregister_ahash(&sha1_mb_async_alg);
-	crypto_unregister_shash(&sha1_mb_shash_alg);
+	crypto_unregister_ahash(&sha1_mb_areq_alg);
 	for_each_possible_cpu(cpu) {
 		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
 		kfree(cpu_state->mgr);
diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c
index c4eb9da49d4f..86fb59b109a9 100644
--- a/crypto/mcryptd.c
+++ b/crypto/mcryptd.c
@@ -41,7 +41,7 @@ struct mcryptd_flush_list {
 static struct mcryptd_flush_list __percpu *mcryptd_flist;
 
 struct hashd_instance_ctx {
-	struct crypto_shash_spawn spawn;
+	struct crypto_ahash_spawn spawn;
 	struct mcryptd_queue *queue;
 };
 
@@ -272,18 +272,18 @@ static int mcryptd_hash_init_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
 	struct hashd_instance_ctx *ictx = crypto_instance_ctx(inst);
-	struct crypto_shash_spawn *spawn = &ictx->spawn;
+	struct crypto_ahash_spawn *spawn = &ictx->spawn;
 	struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct crypto_shash *hash;
+	struct crypto_ahash *hash;
 
-	hash = crypto_spawn_shash(spawn);
+	hash = crypto_spawn_ahash(spawn);
 	if (IS_ERR(hash))
 		return PTR_ERR(hash);
 
 	ctx->child = hash;
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct mcryptd_hash_request_ctx) +
-				 crypto_shash_descsize(hash));
+				 crypto_ahash_reqsize(hash));
 	return 0;
 }
 
@@ -291,21 +291,21 @@ static void mcryptd_hash_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(tfm);
 
-	crypto_free_shash(ctx->child);
+	crypto_free_ahash(ctx->child);
 }
 
 static int mcryptd_hash_setkey(struct crypto_ahash *parent,
 				   const u8 *key, unsigned int keylen)
 {
 	struct mcryptd_hash_ctx *ctx   = crypto_ahash_ctx(parent);
-	struct crypto_shash *child = ctx->child;
+	struct crypto_ahash *child = ctx->child;
 	int err;
 
-	crypto_shash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
-	crypto_shash_set_flags(child, crypto_ahash_get_flags(parent) &
+	crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ahash_set_flags(child, crypto_ahash_get_flags(parent) &
 				      CRYPTO_TFM_REQ_MASK);
-	err = crypto_shash_setkey(child, key, keylen);
-	crypto_ahash_set_flags(parent, crypto_shash_get_flags(child) &
+	err = crypto_ahash_setkey(child, key, keylen);
+	crypto_ahash_set_flags(parent, crypto_ahash_get_flags(child) &
 				       CRYPTO_TFM_RES_MASK);
 	return err;
 }
@@ -331,20 +331,20 @@ static int mcryptd_hash_enqueue(struct ahash_request *req,
 static void mcryptd_hash_init(struct crypto_async_request *req_async, int err)
 {
 	struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(req_async->tfm);
-	struct crypto_shash *child = ctx->child;
+	struct crypto_ahash *child = ctx->child;
 	struct ahash_request *req = ahash_request_cast(req_async);
 	struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
-	struct shash_desc *desc = &rctx->desc;
+	struct ahash_request *desc = &rctx->areq;
 
 	if (unlikely(err == -EINPROGRESS))
 		goto out;
 
-	desc->tfm = child;
-	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	ahash_request_set_tfm(desc, child);
+	ahash_request_set_callback(desc, CRYPTO_TFM_REQ_MAY_SLEEP,
+						rctx->complete, req_async);
 
-	err = crypto_shash_init(desc);
-
-	req->base.complete = rctx->complete;
+	rctx->out = req->result;
+	err = crypto_ahash_init(desc);
 
 out:
 	local_bh_disable();
@@ -365,7 +365,8 @@ static void mcryptd_hash_update(struct crypto_async_request *req_async, int err)
 	if (unlikely(err == -EINPROGRESS))
 		goto out;
 
-	err = shash_ahash_mcryptd_update(req, &rctx->desc);
+	rctx->out = req->result;
+	err = ahash_mcryptd_update(&rctx->areq);
 	if (err) {
 		req->base.complete = rctx->complete;
 		goto out;
@@ -391,7 +392,8 @@ static void mcryptd_hash_final(struct crypto_async_request *req_async, int err)
 	if (unlikely(err == -EINPROGRESS))
 		goto out;
 
-	err = shash_ahash_mcryptd_final(req, &rctx->desc);
+	rctx->out = req->result;
+	err = ahash_mcryptd_final(&rctx->areq);
 	if (err) {
 		req->base.complete = rctx->complete;
 		goto out;
@@ -416,8 +418,8 @@ static void mcryptd_hash_finup(struct crypto_async_request *req_async, int err)
 
 	if (unlikely(err == -EINPROGRESS))
 		goto out;
-
-	err = shash_ahash_mcryptd_finup(req, &rctx->desc);
+	rctx->out = req->result;
+	err = ahash_mcryptd_finup(&rctx->areq);
 
 	if (err) {
 		req->base.complete = rctx->complete;
@@ -439,25 +441,21 @@ static int mcryptd_hash_finup_enqueue(struct ahash_request *req)
 static void mcryptd_hash_digest(struct crypto_async_request *req_async, int err)
 {
 	struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(req_async->tfm);
-	struct crypto_shash *child = ctx->child;
+	struct crypto_ahash *child = ctx->child;
 	struct ahash_request *req = ahash_request_cast(req_async);
 	struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
-	struct shash_desc *desc = &rctx->desc;
+	struct ahash_request *desc = &rctx->areq;
 
 	if (unlikely(err == -EINPROGRESS))
 		goto out;
 
-	desc->tfm = child;
-	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;  /* check this again */
-
-	err = shash_ahash_mcryptd_digest(req, desc);
+	ahash_request_set_tfm(desc, child);
+	ahash_request_set_callback(desc, CRYPTO_TFM_REQ_MAY_SLEEP,
+						rctx->complete, req_async);
 
-	if (err) {
-		req->base.complete = rctx->complete;
-		goto out;
-	}
+	rctx->out = req->result;
+	err = ahash_mcryptd_digest(desc);
 
-	return;
 out:
 	local_bh_disable();
 	rctx->complete(&req->base, err);
@@ -473,14 +471,14 @@ static int mcryptd_hash_export(struct ahash_request *req, void *out)
 {
 	struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
 
-	return crypto_shash_export(&rctx->desc, out);
+	return crypto_ahash_export(&rctx->areq, out);
 }
 
 static int mcryptd_hash_import(struct ahash_request *req, const void *in)
 {
 	struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
 
-	return crypto_shash_import(&rctx->desc, in);
+	return crypto_ahash_import(&rctx->areq, in);
 }
 
 static int mcryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
@@ -488,7 +486,7 @@ static int mcryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
 {
 	struct hashd_instance_ctx *ctx;
 	struct ahash_instance *inst;
-	struct shash_alg *salg;
+	struct hash_alg_common *halg;
 	struct crypto_alg *alg;
 	u32 type = 0;
 	u32 mask = 0;
@@ -496,11 +494,11 @@ static int mcryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
 
 	mcryptd_check_internal(tb, &type, &mask);
 
-	salg = shash_attr_alg(tb[1], type, mask);
-	if (IS_ERR(salg))
-		return PTR_ERR(salg);
+	halg = ahash_attr_alg(tb[1], type, mask);
+	if (IS_ERR(halg))
+		return PTR_ERR(halg);
 
-	alg = &salg->base;
+	alg = &halg->base;
 	pr_debug("crypto: mcryptd hash alg: %s\n", alg->cra_name);
 	inst = mcryptd_alloc_instance(alg, ahash_instance_headroom(),
 					sizeof(*ctx));
@@ -511,7 +509,7 @@ static int mcryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
 	ctx = ahash_instance_ctx(inst);
 	ctx->queue = queue;
 
-	err = crypto_init_shash_spawn(&ctx->spawn, salg,
+	err = crypto_init_ahash_spawn(&ctx->spawn, halg,
 				      ahash_crypto_instance(inst));
 	if (err)
 		goto out_free_inst;
@@ -521,8 +519,8 @@ static int mcryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
 		type |= CRYPTO_ALG_INTERNAL;
 	inst->alg.halg.base.cra_flags = type;
 
-	inst->alg.halg.digestsize = salg->digestsize;
-	inst->alg.halg.statesize = salg->statesize;
+	inst->alg.halg.digestsize = halg->digestsize;
+	inst->alg.halg.statesize = halg->statesize;
 	inst->alg.halg.base.cra_ctxsize = sizeof(struct mcryptd_hash_ctx);
 
 	inst->alg.halg.base.cra_init = mcryptd_hash_init_tfm;
@@ -539,7 +537,7 @@ static int mcryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
 
 	err = ahash_register_instance(tmpl, inst);
 	if (err) {
-		crypto_drop_shash(&ctx->spawn);
+		crypto_drop_ahash(&ctx->spawn);
 out_free_inst:
 		kfree(inst);
 	}
@@ -575,7 +573,7 @@ static void mcryptd_free(struct crypto_instance *inst)
 
 	switch (inst->alg.cra_flags & CRYPTO_ALG_TYPE_MASK) {
 	case CRYPTO_ALG_TYPE_AHASH:
-		crypto_drop_shash(&hctx->spawn);
+		crypto_drop_ahash(&hctx->spawn);
 		kfree(ahash_instance(inst));
 		return;
 	default:
@@ -612,55 +610,38 @@ struct mcryptd_ahash *mcryptd_alloc_ahash(const char *alg_name,
 }
 EXPORT_SYMBOL_GPL(mcryptd_alloc_ahash);
 
-int shash_ahash_mcryptd_digest(struct ahash_request *req,
-			       struct shash_desc *desc)
+int ahash_mcryptd_digest(struct ahash_request *desc)
 {
 	int err;
 
-	err = crypto_shash_init(desc) ?:
-	      shash_ahash_mcryptd_finup(req, desc);
+	err = crypto_ahash_init(desc) ?:
+	      ahash_mcryptd_finup(desc);
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(shash_ahash_mcryptd_digest);
 
-int shash_ahash_mcryptd_update(struct ahash_request *req,
-			       struct shash_desc *desc)
+int ahash_mcryptd_update(struct ahash_request *desc)
 {
-	struct crypto_shash *tfm = desc->tfm;
-	struct shash_alg *shash = crypto_shash_alg(tfm);
-
 	/* alignment is to be done by multi-buffer crypto algorithm if needed */
 
-	return shash->update(desc, NULL, 0);
+	return crypto_ahash_update(desc);
 }
-EXPORT_SYMBOL_GPL(shash_ahash_mcryptd_update);
 
-int shash_ahash_mcryptd_finup(struct ahash_request *req,
-			      struct shash_desc *desc)
+int ahash_mcryptd_finup(struct ahash_request *desc)
 {
-	struct crypto_shash *tfm = desc->tfm;
-	struct shash_alg *shash = crypto_shash_alg(tfm);
-
 	/* alignment is to be done by multi-buffer crypto algorithm if needed */
 
-	return shash->finup(desc, NULL, 0, req->result);
+	return crypto_ahash_finup(desc);
 }
-EXPORT_SYMBOL_GPL(shash_ahash_mcryptd_finup);
 
-int shash_ahash_mcryptd_final(struct ahash_request *req,
-			      struct shash_desc *desc)
+int ahash_mcryptd_final(struct ahash_request *desc)
 {
-	struct crypto_shash *tfm = desc->tfm;
-	struct shash_alg *shash = crypto_shash_alg(tfm);
-
 	/* alignment is to be done by multi-buffer crypto algorithm if needed */
 
-	return shash->final(desc, req->result);
+	return crypto_ahash_final(desc);
 }
-EXPORT_SYMBOL_GPL(shash_ahash_mcryptd_final);
 
-struct crypto_shash *mcryptd_ahash_child(struct mcryptd_ahash *tfm)
+struct crypto_ahash *mcryptd_ahash_child(struct mcryptd_ahash *tfm)
 {
 	struct mcryptd_hash_ctx *ctx = crypto_ahash_ctx(&tfm->base);
 
@@ -668,12 +649,12 @@ struct crypto_shash *mcryptd_ahash_child(struct mcryptd_ahash *tfm)
 }
 EXPORT_SYMBOL_GPL(mcryptd_ahash_child);
 
-struct shash_desc *mcryptd_shash_desc(struct ahash_request *req)
+struct ahash_request *mcryptd_ahash_desc(struct ahash_request *req)
 {
 	struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
-	return &rctx->desc;
+	return &rctx->areq;
 }
-EXPORT_SYMBOL_GPL(mcryptd_shash_desc);
+EXPORT_SYMBOL_GPL(mcryptd_ahash_desc);
 
 void mcryptd_free_ahash(struct mcryptd_ahash *tfm)
 {
@@ -681,7 +662,6 @@ void mcryptd_free_ahash(struct mcryptd_ahash *tfm)
 }
 EXPORT_SYMBOL_GPL(mcryptd_free_ahash);
 
-
 static int __init mcryptd_init(void)
 {
 	int err, cpu;
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 49dae16f8929..1d4f365d8f03 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -114,14 +114,10 @@ int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc);
 int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc);
 int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc);
 
-int shash_ahash_mcryptd_update(struct ahash_request *req,
-			       struct shash_desc *desc);
-int shash_ahash_mcryptd_final(struct ahash_request *req,
-			      struct shash_desc *desc);
-int shash_ahash_mcryptd_finup(struct ahash_request *req,
-			      struct shash_desc *desc);
-int shash_ahash_mcryptd_digest(struct ahash_request *req,
-			       struct shash_desc *desc);
+int ahash_mcryptd_update(struct ahash_request *desc);
+int ahash_mcryptd_final(struct ahash_request *desc);
+int ahash_mcryptd_finup(struct ahash_request *desc);
+int ahash_mcryptd_digest(struct ahash_request *desc);
 
 int crypto_init_shash_ops_async(struct crypto_tfm *tfm);
 
diff --git a/include/crypto/mcryptd.h b/include/crypto/mcryptd.h
index c23ee1f7ee80..4a53c0d38cd2 100644
--- a/include/crypto/mcryptd.h
+++ b/include/crypto/mcryptd.h
@@ -39,7 +39,7 @@ struct mcryptd_instance_ctx {
 };
 
 struct mcryptd_hash_ctx {
-	struct crypto_shash *child;
+	struct crypto_ahash *child;
 	struct mcryptd_alg_state *alg_state;
 };
 
@@ -59,13 +59,13 @@ struct mcryptd_hash_request_ctx {
 	struct crypto_hash_walk walk;
 	u8 *out;
 	int flag;
-	struct shash_desc desc;
+	struct ahash_request areq;
 };
 
 struct mcryptd_ahash *mcryptd_alloc_ahash(const char *alg_name,
 					u32 type, u32 mask);
-struct crypto_shash *mcryptd_ahash_child(struct mcryptd_ahash *tfm);
-struct shash_desc *mcryptd_shash_desc(struct ahash_request *req);
+struct crypto_ahash *mcryptd_ahash_child(struct mcryptd_ahash *tfm);
+struct ahash_request *mcryptd_ahash_desc(struct ahash_request *req);
 void mcryptd_free_ahash(struct mcryptd_ahash *tfm);
 void mcryptd_flusher(struct work_struct *work);
 
-- 
cgit v1.2.3


From f876f440df3973ab7f1d20e3d34d000fc9422a78 Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@intel.com>
Date: Thu, 23 Jun 2016 18:40:42 -0700
Subject: crypto: sha256-mb - SHA256 multibuffer job manager and glue code

This patch introduces the multi-buffer job manager which is responsible for
submitting scatter-gather buffers from several SHA256 jobs to the
multi-buffer algorithm. It also contains the flush routine to that's
called by the crypto daemon to complete the job when no new jobs arrive
before the deadline of maximum latency of a SHA256 crypto job.

The SHA256 multi-buffer crypto algorithm is defined and initialized in
this patch.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile              |    1 +
 arch/x86/crypto/sha256-mb/Makefile    |   11 +
 arch/x86/crypto/sha256-mb/sha256_mb.c | 1027 +++++++++++++++++++++++++++++++++
 3 files changed, 1039 insertions(+)
 create mode 100644 arch/x86/crypto/sha256-mb/Makefile
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_mb.c

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b9b912a44d61..aafff22b0ea6 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -50,6 +50,7 @@ ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
 	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/
+	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
new file mode 100644
index 000000000000..41089e7c400c
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -0,0 +1,11 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+                                $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o
+	sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \
+	     sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
new file mode 100644
index 000000000000..c9d5dcc81c96
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb.c
@@ -0,0 +1,1027 @@
+/*
+ * Multi buffer SHA256 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha256_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha256_mb_alg_state;
+
+struct sha256_mb_ctx {
+	struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+		*cast_hash_to_mcryptd_ctx(struct sha256_hash_ctx *hash_ctx)
+{
+	struct ahash_request *areq;
+
+	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+	return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+				struct ahash_request *areq)
+{
+	rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha256_job_mgr_init)(struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_submit)
+			(struct sha256_mb_mgr *state, struct job_sha256 *job);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_flush)
+			(struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job)
+			(struct sha256_mb_mgr *state);
+
+inline void sha256_init_digest(uint32_t *digest)
+{
+	static const uint32_t initial_digest[SHA256_DIGEST_LENGTH] = {
+				SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+				SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7};
+	memcpy(digest, initial_digest, sizeof(initial_digest));
+}
+
+inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2],
+			 uint32_t total_len)
+{
+	uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+	memset(&padblock[i], 0, SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	i += ((SHA256_BLOCK_SIZE - 1) &
+	      (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1)))
+	     + 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+	*((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+	/* Number of extra blocks to hash */
+	return i >> SHA256_LOG2_BLOCK_SIZE;
+}
+
+static struct sha256_hash_ctx
+		*sha256_ctx_mgr_resubmit(struct sha256_ctx_mgr *mgr,
+					struct sha256_hash_ctx *ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			/* Clear PROCESSING bit */
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		/*
+		 * If the extra blocks are empty, begin hashing what remains
+		 * in the user's buffer.
+		 */
+		if (ctx->partial_block_buffer_length == 0 &&
+		    ctx->incoming_buffer_length) {
+
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			uint32_t copy_len;
+
+			/*
+			 * Only entire blocks can be hashed.
+			 * Copy remainder to extra blocks buffer.
+			 */
+			copy_len = len & (SHA256_BLOCK_SIZE-1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy(ctx->partial_block_buffer,
+				       ((const char *) buffer + len),
+				       copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			/* len should be a multiple of the block size now */
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+
+			/* Set len to the number of blocks to be hashed */
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (struct sha256_hash_ctx *)
+				sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+				continue;
+			}
+		}
+
+		/*
+		 * If the extra blocks are not empty, then we are
+		 * either on the last block(s) or we need more
+		 * user input before continuing.
+		 */
+		if (ctx->status & HASH_CTX_STS_LAST) {
+
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks =
+				sha256_pad(buf, ctx->total_length);
+
+			ctx->status = (HASH_CTX_STS_PROCESSING |
+				       HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (struct sha256_hash_ctx *)
+				sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static struct sha256_hash_ctx
+		*sha256_ctx_mgr_get_comp_ctx(struct sha256_ctx_mgr *mgr)
+{
+	/*
+	 * If get_comp_job returns NULL, there are no jobs complete.
+	 * If get_comp_job returns a job, verify that it is safe to return to
+	 * the user. If it is not ready, resubmit the job to finish processing.
+	 * If sha256_ctx_mgr_resubmit returned a job, it is ready to be
+	 * returned. Otherwise, all jobs currently being managed by the
+	 * hash_ctx_mgr still need processing.
+	 */
+	struct sha256_hash_ctx *ctx;
+
+	ctx = (struct sha256_hash_ctx *) sha256_job_mgr_get_comp_job(&mgr->mgr);
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha256_ctx_mgr_init(struct sha256_ctx_mgr *mgr)
+{
+	sha256_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
+					  struct sha256_hash_ctx *ctx,
+					  const void *buffer,
+					  uint32_t len,
+					  int flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		/* User should not pass anything other than FIRST, UPDATE
+		 * or LAST
+		 */
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		/* Cannot submit to a currently processing job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		/* Cannot update a finished job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		/* Init digest */
+		sha256_init_digest(ctx->job.result_digest);
+
+		/* Reset byte counter */
+		ctx->total_length = 0;
+
+		/* Clear extra blocks */
+		ctx->partial_block_buffer_length = 0;
+	}
+
+	/* If we made it here, there was no error during this call to submit */
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	/* Store buffer ptr info from user */
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	/* Store the user's request flags and mark this ctx as currently
+	 * being processed.
+	 */
+	ctx->status = (flags & HASH_LAST) ?
+			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+			HASH_CTX_STS_PROCESSING;
+
+	/* Advance byte counter */
+	ctx->total_length += len;
+
+	/*
+	 * If there is anything currently buffered in the extra blocks,
+	 * append to it until it contains a whole block.
+	 * Or if the user's buffer contains less than a whole block,
+	 * append as much as possible to the extra block.
+	 */
+	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+		/* Compute how many bytes to copy from user buffer into
+		 * extra block
+		 */
+		uint32_t copy_len = SHA256_BLOCK_SIZE -
+					ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			/* Copy and update relevant pointers and counters */
+			memcpy(
+		&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+				buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)
+					((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+
+		/* The extra block should never contain more than 1 block */
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		/* If the extra block buffer contains exactly 1 block,
+		 * it can be hashed.
+		 */
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (struct sha256_hash_ctx *)
+				sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_flush(struct sha256_ctx_mgr *mgr)
+{
+	struct sha256_hash_ctx *ctx;
+
+	while (1) {
+		ctx = (struct sha256_hash_ctx *)
+					sha256_job_mgr_flush(&mgr->mgr);
+
+		/* If flush returned 0, there are no more jobs in flight. */
+		if (!ctx)
+			return NULL;
+
+		/*
+		 * If flush returned a job, resubmit the job to finish
+		 * processing.
+		 */
+		ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+		/*
+		 * If sha256_ctx_mgr_resubmit returned a job, it is ready to
+		 * be returned. Otherwise, all jobs currently being managed by
+		 * the sha256_ctx_mgr still need processing. Loop.
+		 */
+		if (ctx)
+			return ctx;
+	}
+}
+
+static int sha256_mb_init(struct ahash_request *areq)
+{
+	struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	hash_ctx_init(sctx);
+	sctx->job.result_digest[0] = SHA256_H0;
+	sctx->job.result_digest[1] = SHA256_H1;
+	sctx->job.result_digest[2] = SHA256_H2;
+	sctx->job.result_digest[3] = SHA256_H3;
+	sctx->job.result_digest[4] = SHA256_H4;
+	sctx->job.result_digest[5] = SHA256_H5;
+	sctx->job.result_digest[6] = SHA256_H6;
+	sctx->job.result_digest[7] = SHA256_H7;
+	sctx->total_length = 0;
+	sctx->partial_block_buffer_length = 0;
+	sctx->status = HASH_CTX_STS_IDLE;
+
+	return 0;
+}
+
+static int sha256_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+	int	i;
+	struct	sha256_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+	__be32	*dst = (__be32 *) rctx->out;
+
+	for (i = 0; i < 8; ++i)
+		dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
+
+	return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+			struct mcryptd_alg_cstate *cstate, bool flush)
+{
+	int	flag = HASH_UPDATE;
+	int	nbytes, err = 0;
+	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+	struct sha256_hash_ctx *sha_ctx;
+
+	/* more work ? */
+	while (!(rctx->flag & HASH_DONE)) {
+		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+		if (nbytes < 0) {
+			err = nbytes;
+			goto out;
+		}
+		/* check if the walk is done */
+		if (crypto_ahash_walk_last(&rctx->walk)) {
+			rctx->flag |= HASH_DONE;
+			if (rctx->flag & HASH_FINAL)
+				flag |= HASH_LAST;
+
+		}
+		sha_ctx = (struct sha256_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
+		kernel_fpu_begin();
+		sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
+		if (!sha_ctx) {
+			if (flush)
+				sha_ctx = sha256_ctx_mgr_flush(cstate->mgr);
+		}
+		kernel_fpu_end();
+		if (sha_ctx)
+			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		else {
+			rctx = NULL;
+			goto out;
+		}
+	}
+
+	/* copy the results */
+	if (rctx->flag & HASH_FINAL)
+		sha256_mb_set_results(rctx);
+
+out:
+	*ret_rctx = rctx;
+	return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* remove from work list */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* check to see if there are other jobs that are done */
+	sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			if (irqs_disabled())
+				rctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				rctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+
+	return 0;
+}
+
+static void sha256_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+			     struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long next_flush;
+	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* initialize tag */
+	rctx->tag.arrival = jiffies;    /* tag the arrival time */
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	next_flush = rctx->tag.arrival + delay;
+	rctx->tag.expire = next_flush;
+
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha256_mb_update(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk))
+		rctx->flag |= HASH_DONE;
+
+	/* submit */
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	sha256_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha256_mb_finup(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+
+	/* submit */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	sha256_mb_add_list(rctx, cstate);
+
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha256_mb_final(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+			areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0;
+	u8 data;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	rctx->flag |= HASH_DONE | HASH_FINAL;
+
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	/* flag HASH_FINAL and 0 data size */
+	sha256_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+								HASH_LAST);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha256_mb_export(struct ahash_request *areq, void *out)
+{
+	struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_mb_import(struct ahash_request *areq, const void *in)
+{
+	struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct mcryptd_ahash *mcryptd_tfm;
+	struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_hash_ctx *mctx;
+
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha256-mb",
+						CRYPTO_ALG_INTERNAL,
+						CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(mcryptd_tfm))
+		return PTR_ERR(mcryptd_tfm);
+	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+	mctx->alg_state = &sha256_mb_alg_state;
+	ctx->mcryptd_tfm = mcryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+	return 0;
+}
+
+static void sha256_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha256_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				sizeof(struct sha256_hash_ctx));
+
+	return 0;
+}
+
+static void sha256_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha256_mb_areq_alg = {
+	.init		=	sha256_mb_init,
+	.update		=	sha256_mb_update,
+	.final		=	sha256_mb_final,
+	.finup		=	sha256_mb_finup,
+	.export		=	sha256_mb_export,
+	.import		=	sha256_mb_import,
+	.halg		=	{
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.statesize	=	sizeof(struct sha256_hash_ctx),
+		.base		=	{
+			.cra_name	 = "__sha256-mb",
+			.cra_driver_name = "__intel_sha256-mb",
+			.cra_priority	 = 100,
+			/*
+			 * use ASYNC flag as some buffers in multi-buffer
+			 * algo may not have completed before hashing thread
+			 * sleep
+			 */
+			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
+						CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_INTERNAL,
+			.cra_blocksize	= SHA256_BLOCK_SIZE,
+			.cra_module	= THIS_MODULE,
+			.cra_list	= LIST_HEAD_INIT
+					(sha256_mb_areq_alg.halg.base.cra_list),
+			.cra_init	= sha256_mb_areq_init_tfm,
+			.cra_exit	= sha256_mb_areq_exit_tfm,
+			.cra_ctxsize	= sizeof(struct sha256_hash_ctx),
+		}
+	}
+};
+
+static int sha256_mb_async_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha256_mb_async_update(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha256_mb_async_finup(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha256_mb_async_final(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha256_mb_async_digest(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha256_mb_async_export(struct ahash_request *req, void *out)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha256_mb_async_import(struct ahash_request *req, const void *in)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+	struct mcryptd_hash_request_ctx *rctx;
+	struct ahash_request *areq;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	rctx = ahash_request_ctx(mcryptd_req);
+	areq = &rctx->areq;
+
+	ahash_request_set_tfm(areq, child);
+	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+					rctx->complete, req);
+
+	return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha256_mb_async_alg = {
+	.init           = sha256_mb_async_init,
+	.update         = sha256_mb_async_update,
+	.final          = sha256_mb_async_final,
+	.finup          = sha256_mb_async_finup,
+	.export         = sha256_mb_async_export,
+	.import         = sha256_mb_async_import,
+	.digest         = sha256_mb_async_digest,
+	.halg = {
+		.digestsize     = SHA256_DIGEST_SIZE,
+		.statesize      = sizeof(struct sha256_hash_ctx),
+		.base = {
+			.cra_name               = "sha256",
+			.cra_driver_name        = "sha256_mb",
+			.cra_priority           = 200,
+			.cra_flags              = CRYPTO_ALG_TYPE_AHASH |
+							CRYPTO_ALG_ASYNC,
+			.cra_blocksize          = SHA256_BLOCK_SIZE,
+			.cra_type               = &crypto_ahash_type,
+			.cra_module             = THIS_MODULE,
+			.cra_list               = LIST_HEAD_INIT
+				(sha256_mb_async_alg.halg.base.cra_list),
+			.cra_init               = sha256_mb_async_init_tfm,
+			.cra_exit               = sha256_mb_async_exit_tfm,
+			.cra_ctxsize		= sizeof(struct sha256_mb_ctx),
+			.cra_alignmask		= 0,
+		},
+	},
+};
+
+static unsigned long sha256_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+	struct mcryptd_hash_request_ctx *rctx;
+	unsigned long cur_time;
+	unsigned long next_flush = 0;
+	struct sha256_hash_ctx *sha_ctx;
+
+
+	cur_time = jiffies;
+
+	while (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		if (time_before(cur_time, rctx->tag.expire))
+			break;
+		kernel_fpu_begin();
+		sha_ctx = (struct sha256_hash_ctx *)
+					sha256_ctx_mgr_flush(cstate->mgr);
+		kernel_fpu_end();
+		if (!sha_ctx) {
+			pr_err("sha256_mb error: nothing got"
+					" flushed for non-empty list\n");
+			break;
+		}
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		sha_finish_walk(&rctx, cstate, true);
+		sha_complete_job(rctx, cstate, 0);
+	}
+
+	if (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/* get the hash context and then flush time */
+		next_flush = rctx->tag.expire;
+		mcryptd_arm_flusher(cstate, get_delay(next_flush));
+	}
+	return next_flush;
+}
+
+static int __init sha256_mb_mod_init(void)
+{
+
+	int cpu;
+	int err;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	/* check for dependent cpu features */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_BMI2))
+		return -ENODEV;
+
+	/* initialize multibuffer structures */
+	sha256_mb_alg_state.alg_cstate = alloc_percpu
+						(struct mcryptd_alg_cstate);
+
+	sha256_job_mgr_init = sha256_mb_mgr_init_avx2;
+	sha256_job_mgr_submit = sha256_mb_mgr_submit_avx2;
+	sha256_job_mgr_flush = sha256_mb_mgr_flush_avx2;
+	sha256_job_mgr_get_comp_job = sha256_mb_mgr_get_comp_job_avx2;
+
+	if (!sha256_mb_alg_state.alg_cstate)
+		return -ENOMEM;
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		cpu_state->next_flush = 0;
+		cpu_state->next_seq_num = 0;
+		cpu_state->flusher_engaged = false;
+		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+		cpu_state->cpu = cpu;
+		cpu_state->alg_state = &sha256_mb_alg_state;
+		cpu_state->mgr = kzalloc(sizeof(struct sha256_ctx_mgr),
+					GFP_KERNEL);
+		if (!cpu_state->mgr)
+			goto err2;
+		sha256_ctx_mgr_init(cpu_state->mgr);
+		INIT_LIST_HEAD(&cpu_state->work_list);
+		spin_lock_init(&cpu_state->work_lock);
+	}
+	sha256_mb_alg_state.flusher = &sha256_mb_flusher;
+
+	err = crypto_register_ahash(&sha256_mb_areq_alg);
+	if (err)
+		goto err2;
+	err = crypto_register_ahash(&sha256_mb_async_alg);
+	if (err)
+		goto err1;
+
+
+	return 0;
+err1:
+	crypto_unregister_ahash(&sha256_mb_areq_alg);
+err2:
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha256_mb_alg_state.alg_cstate);
+	return -ENODEV;
+}
+
+static void __exit sha256_mb_mod_fini(void)
+{
+	int cpu;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	crypto_unregister_ahash(&sha256_mb_async_alg);
+	crypto_unregister_ahash(&sha256_mb_areq_alg);
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha256_mb_alg_state.alg_cstate);
+}
+
+module_init(sha256_mb_mod_init);
+module_exit(sha256_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha256");
-- 
cgit v1.2.3


From a377c6b1876e7ac847a124998e828baf9d8c89c2 Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@intel.com>
Date: Thu, 23 Jun 2016 18:40:44 -0700
Subject: crypto: sha256-mb - submit/flush routines for AVX2

This patch introduces the routines used to submit and flush buffers
belonging to SHA256 crypto jobs to the SHA256 multibuffer algorithm. It
is implemented mostly in assembly optimized with AVX2 instructions.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 .../crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S    | 304 +++++++++++++++++++++
 .../x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c |  65 +++++
 .../crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S   | 215 +++++++++++++++
 3 files changed, 584 insertions(+)
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..b691da981cd9
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -0,0 +1,304 @@
+/*
+ * Flush routine for SHA256 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+#LINUX register definitions
+#define arg1	%rdi
+#define arg2	%rsi
+
+# Common register definitions
+#define state	arg1
+#define job	arg2
+#define len2	arg2
+
+# idx must be a register not clobberred by sha1_mult
+#define idx		%r8
+#define DWORD_idx	%r8d
+
+#define unused_lanes	%rbx
+#define lane_data	%rbx
+#define tmp2		%rbx
+#define tmp2_w		%ebx
+
+#define job_rax		%rax
+#define tmp1		%rax
+#define size_offset	%rax
+#define tmp		%rax
+#define start_offset	%rax
+
+#define tmp3		%arg1
+
+#define extra_blocks	%arg2
+#define p		%arg2
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne     skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha256_mb_mgr_flush_avx2)
+	FRAME_BEGIN
+        push    %rbx
+
+	# If bit (32+3) is set, then all lanes are empty
+	mov	_unused_lanes(state), unused_lanes
+	bt	$32+3, unused_lanes
+	jc	return_null
+
+	# find a lane with a non-null job
+	xor	idx, idx
+	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	one(%rip), idx
+	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	two(%rip), idx
+	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	three(%rip), idx
+	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	four(%rip), idx
+	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	five(%rip), idx
+	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	six(%rip), idx
+	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	seven(%rip), idx
+
+	# copy idx to empty lanes
+copy_lane_data:
+	offset =  (_args + _data_ptr)
+	mov	offset(state,idx,8), tmp
+
+	I = 0
+.rep 8
+	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+.altmacro
+	JNE_SKIP %I
+	offset =  (_args + _data_ptr + 8*I)
+	mov	tmp, offset(state)
+	offset =  (_lens + 4*I)
+	movl	$0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+	I = (I+1)
+.noaltmacro
+.endr
+
+	# Find min length
+	vmovdqa _lens+0*16(state), %xmm0
+	vmovdqa _lens+1*16(state), %xmm1
+
+	vpminud %xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud %xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud %xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	vmovd	%xmm2, DWORD_idx
+	mov	idx, len2
+	and	$0xF, idx
+	shr	$4, len2
+	jz	len_is_0
+
+	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
+	vpshufd	$0, %xmm2, %xmm2
+
+	vpsubd	%xmm2, %xmm0, %xmm0
+	vpsubd	%xmm2, %xmm1, %xmm1
+
+	vmovdqa	%xmm0, _lens+0*16(state)
+	vmovdqa	%xmm1, _lens+1*16(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call	sha256_x8_avx2
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	movq	$0, _job_in_lane(lane_data)
+	movl	$STS_COMPLETED, _status(job_rax)
+	mov	_unused_lanes(state), unused_lanes
+	shl	$4, unused_lanes
+	or	idx, unused_lanes
+
+	mov	unused_lanes, _unused_lanes(state)
+	movl	$0xFFFFFFFF, _lens(state,idx,4)
+
+	vmovd	_args_digest(state , idx, 4) , %xmm0
+	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)
+	offset =  (_result_digest + 1*16)
+	vmovdqu	%xmm1, offset(job_rax)
+
+return:
+	pop     %rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+ENDPROC(sha256_mb_mgr_flush_avx2)
+
+##############################################################################
+
+.align 16
+ENTRY(sha256_mb_mgr_get_comp_job_avx2)
+	push	%rbx
+
+	## if bit 32+3 is set, then all lanes are empty
+	mov	_unused_lanes(state), unused_lanes
+	bt	$(32+3), unused_lanes
+	jc	.return_null
+
+	# Find min length
+	vmovdqa	_lens(state), %xmm0
+	vmovdqa	_lens+1*16(state), %xmm1
+
+	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	vmovd	%xmm2, DWORD_idx
+	test	$~0xF, idx
+	jnz	.return_null
+
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	movq	$0,  _job_in_lane(lane_data)
+	movl	$STS_COMPLETED, _status(job_rax)
+	mov	_unused_lanes(state), unused_lanes
+	shl	$4, unused_lanes
+	or	idx, unused_lanes
+	mov	unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state,  idx, 4)
+
+	vmovd	_args_digest(state, idx, 4), %xmm0
+	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	movl	_args_digest+4*32(state, idx, 4), tmp2_w
+	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)
+	movl	tmp2_w, _result_digest+1*16(job_rax)
+
+	pop	%rbx
+
+	ret
+
+.return_null:
+	xor	job_rax, job_rax
+	pop	%rbx
+	ret
+ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+.octa	0x000000000000000000000000FFFFFFF0
+one:
+.quad	1
+two:
+.quad	2
+three:
+.quad	3
+four:
+.quad	4
+five:
+.quad	5
+six:
+.quad	6
+seven:
+.quad  7
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..b0c498371e67
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,65 @@
+/*
+ * Initialization code for multi buffer SHA256 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha256_mb_mgr.h"
+
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state)
+{
+	unsigned int j;
+
+	state->unused_lanes = 0xF76543210ULL;
+	for (j = 0; j < 8; j++) {
+		state->lens[j] = 0xFFFFFFFF;
+		state->ldata[j].job_in_lane = NULL;
+	}
+}
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..7ea670e25acc
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
@@ -0,0 +1,215 @@
+/*
+ * Buffer submit code for multi buffer SHA256 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+# LINUX register definitions
+arg1		= %rdi
+arg2		= %rsi
+size_offset	= %rcx
+tmp2		= %rcx
+extra_blocks	= %rdx
+
+# Common definitions
+#define state	arg1
+#define job	%rsi
+#define len2	arg2
+#define p2	arg2
+
+# idx must be a register not clobberred by sha1_x8_avx2
+idx		= %r8
+DWORD_idx	= %r8d
+last_len	= %r8
+
+p		= %r11
+start_offset	= %r11
+
+unused_lanes	= %rbx
+BYTE_unused_lanes = %bl
+
+job_rax		= %rax
+len		= %rax
+DWORD_len	= %eax
+
+lane		= %r12
+tmp3		= %r12
+
+tmp		= %r9
+DWORD_tmp	= %r9d
+
+lane_data	= %r10
+
+# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha256_mb_mgr_submit_avx2)
+	FRAME_BEGIN
+	push	%rbx
+	push	%r12
+
+	mov	_unused_lanes(state), unused_lanes
+	mov	unused_lanes, lane
+	and	$0xF, lane
+	shr	$4, unused_lanes
+	imul	$_LANE_DATA_size, lane, lane_data
+	movl	$STS_BEING_PROCESSED, _status(job)
+	lea	_ldata(state, lane_data), lane_data
+	mov	unused_lanes, _unused_lanes(state)
+	movl	_len(job),  DWORD_len
+
+	mov	job, _job_in_lane(lane_data)
+	shl	$4, len
+	or	lane, len
+
+	movl	DWORD_len,  _lens(state , lane, 4)
+
+	# Load digest words from result_digest
+	vmovdqu	_result_digest(job), %xmm0
+	vmovdqu	_result_digest+1*16(job), %xmm1
+	vmovd	%xmm0, _args_digest(state, lane, 4)
+	vpextrd	$1, %xmm0, _args_digest+1*32(state , lane, 4)
+	vpextrd	$2, %xmm0, _args_digest+2*32(state , lane, 4)
+	vpextrd	$3, %xmm0, _args_digest+3*32(state , lane, 4)
+	vmovd	%xmm1, _args_digest+4*32(state , lane, 4)
+
+	vpextrd	$1, %xmm1, _args_digest+5*32(state , lane, 4)
+	vpextrd	$2, %xmm1, _args_digest+6*32(state , lane, 4)
+	vpextrd	$3, %xmm1, _args_digest+7*32(state , lane, 4)
+
+	mov	_buffer(job), p
+	mov	p, _args_data_ptr(state, lane, 8)
+
+	cmp	$0xF, unused_lanes
+	jne	return_null
+
+start_loop:
+	# Find min length
+	vmovdqa	_lens(state), %xmm0
+	vmovdqa	_lens+1*16(state), %xmm1
+
+	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	vmovd	%xmm2, DWORD_idx
+	mov	idx, len2
+	and	$0xF, idx
+	shr	$4, len2
+	jz	len_is_0
+
+	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
+	vpshufd	$0, %xmm2, %xmm2
+
+	vpsubd	%xmm2, %xmm0, %xmm0
+	vpsubd	%xmm2, %xmm1, %xmm1
+
+	vmovdqa	%xmm0, _lens + 0*16(state)
+	vmovdqa	%xmm1, _lens + 1*16(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call	sha256_x8_avx2
+
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	mov	_unused_lanes(state), unused_lanes
+	movq	$0, _job_in_lane(lane_data)
+	movl	$STS_COMPLETED, _status(job_rax)
+	shl	$4, unused_lanes
+	or	idx, unused_lanes
+	mov	unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state,idx,4)
+
+	vmovd	_args_digest(state, idx, 4), %xmm0
+	vpinsrd	$1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+
+	vpinsrd	$1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)
+	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
+
+return:
+	pop     %r12
+        pop     %rbx
+        FRAME_END
+	ret
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+ENDPROC(sha256_mb_mgr_submit_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+	.octa	0x000000000000000000000000FFFFFFF0
-- 
cgit v1.2.3


From 98cf10383a5507147793789bc5c2c02688df44b2 Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@intel.com>
Date: Thu, 23 Jun 2016 18:40:45 -0700
Subject: crypto: sha256-mb - Algorithm data structures

This patch introduces the data structures and prototypes of
functions needed for computing SHA256 hash using multi-buffer.
Included are the structures of the multi-buffer SHA256 job,
job scheduler in C and x86 assembly.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha256-mb/sha256_mb_ctx.h          | 136 +++++++++
 arch/x86/crypto/sha256-mb/sha256_mb_mgr.h          | 108 ++++++++
 .../crypto/sha256-mb/sha256_mb_mgr_datastruct.S    | 304 +++++++++++++++++++++
 3 files changed, 548 insertions(+)
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
new file mode 100644
index 000000000000..edd252b73206
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
@@ -0,0 +1,136 @@
+/*
+ * Header file for multi buffer SHA256 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha256_mb_mgr.h"
+
+#define HASH_UPDATE          0x00
+#define HASH_FIRST           0x01
+#define HASH_LAST            0x02
+#define HASH_ENTIRE          0x03
+#define HASH_DONE	     0x04
+#define HASH_FINAL	     0x08
+
+#define HASH_CTX_STS_IDLE       0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST       0x02
+#define HASH_CTX_STS_COMPLETE   0x04
+
+enum hash_ctx_error {
+	HASH_CTX_ERROR_NONE               =  0,
+	HASH_CTX_ERROR_INVALID_FLAGS      = -1,
+	HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+	HASH_CTX_ERROR_ALREADY_COMPLETED  = -3,
+
+#ifdef HASH_CTX_DEBUG
+	HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+
+#define hash_ctx_user_data(ctx)  ((ctx)->user_data)
+#define hash_ctx_digest(ctx)     ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx)   ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx)     ((ctx)->status)
+#define hash_ctx_error(ctx)      ((ctx)->error)
+#define hash_ctx_init(ctx) \
+	do { \
+		(ctx)->error = HASH_CTX_ERROR_NONE; \
+		(ctx)->status = HASH_CTX_STS_COMPLETE; \
+	} while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA256_DIGEST_LENGTH        8
+#define SHA256_LOG2_BLOCK_SIZE        6
+
+#define SHA256_PADLENGTHFIELD_SIZE    8
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+	if (unlikely(!(expr))) { \
+		printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+		#expr, __FILE__, __func__, __LINE__); \
+	} \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha256_ctx_mgr {
+	struct sha256_mb_mgr mgr;
+};
+
+/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */
+
+struct sha256_hash_ctx {
+	/* Must be at struct offset 0 */
+	struct job_sha256       job;
+	/* status flag */
+	int status;
+	/* error flag */
+	int error;
+
+	uint32_t	total_length;
+	const void	*incoming_buffer;
+	uint32_t	incoming_buffer_length;
+	uint8_t		partial_block_buffer[SHA256_BLOCK_SIZE * 2];
+	uint32_t	partial_block_buffer_length;
+	void		*user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
new file mode 100644
index 000000000000..b01ae408c56d
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
@@ -0,0 +1,108 @@
+/*
+ * Header file for multi buffer SHA256 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA256_DIGEST_WORDS 8
+
+enum job_sts {	STS_UNKNOWN = 0,
+		STS_BEING_PROCESSED = 1,
+		STS_COMPLETED = 2,
+		STS_INTERNAL_ERROR = 3,
+		STS_ERROR = 4
+};
+
+struct job_sha256 {
+	u8	*buffer;
+	u32	len;
+	u32	result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32);
+	enum	job_sts status;
+	void	*user_data;
+};
+
+/* SHA256 out-of-order scheduler */
+
+/* typedef uint32_t sha8_digest_array[8][8]; */
+
+struct sha256_args_x8 {
+	uint32_t	digest[8][8];
+	uint8_t		*data_ptr[8];
+};
+
+struct sha256_lane_data {
+	struct job_sha256 *job_in_lane;
+};
+
+struct sha256_mb_mgr {
+	struct sha256_args_x8 args;
+
+	uint32_t lens[8];
+
+	/* each byte is index (0...7) of unused lanes */
+	uint64_t unused_lanes;
+	/* byte 4 is set to FF as a flag */
+	struct sha256_lane_data ldata[8];
+};
+
+
+#define SHA256_MB_MGR_NUM_LANES_AVX2 8
+
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state,
+					 struct job_sha256 *job);
+struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..5c377bac21d0
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
@@ -0,0 +1,304 @@
+/*
+ * Header file for multi buffer SHA256 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS	# JOB_AES
+###	name		size	align
+#FIELD	_plaintext,	8,	8	# pointer to plaintext
+#FIELD	_ciphertext,	8,	8	# pointer to ciphertext
+#FIELD	_IV,		16,	8	# IV
+#FIELD	_keys,		8,	8	# pointer to keys
+#FIELD	_len,		4,	4	# length in bytes
+#FIELD	_status,	4,	4	# status enumeration
+#FIELD	_user_data,	8,	8	# pointer to user data
+#UNION  _union,         size1,  align1, \
+#	                size2,  align2, \
+#	                size3,  align3, \
+#	                ...
+#END_FIELDS
+#%assign _JOB_AES_size	_FIELD_OFFSET
+#%assign _JOB_AES_align	_STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+#	STRUCT job_aes2
+#	RES_Q	.plaintext,	1
+#	RES_Q	.ciphertext, 	1
+#	RES_DQ	.IV,		1
+#	RES_B	.nested,	_JOB_AES_SIZE, _JOB_AES_ALIGN
+#	RES_U	.union,		size1, align1, \
+#				size2, align2, \
+#				...
+#	ENDSTRUCT
+#	# Following only needed if nesting
+#	%assign job_aes2_size	_FIELD_OFFSET
+#	%assign job_aes2_align	_STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro    Base size
+# RES_B	    1
+# RES_W	    2
+# RES_D     4
+# RES_Q     8
+# RES_DQ   16
+# RES_Y    32
+# RES_Z    64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+
+#define SZ8			8*SHA256_DIGEST_WORD_SIZE
+#define ROUNDS			64*SZ8
+#define PTR_SZ                  8
+#define SHA256_DIGEST_WORD_SIZE 4
+#define MAX_SHA256_LANES        8
+#define SHA256_DIGEST_WORDS 8
+#define SHA256_DIGEST_ROW_SIZE  (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+#define SHA256_DIGEST_SIZE      (SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS)
+#define SHA256_BLK_SZ           64
+
+# START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+# FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name	= _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+	.lcomm	tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+
+########################################################################
+#### Define SHA256 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS    # LANE_DATA
+###     name            size    align
+FIELD   _job_in_lane,   8,      8       # pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET
+ _LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS    # SHA256_ARGS_X4
+###     name            size    align
+FIELD   _digest,        4*8*8,  4       # transposed digest
+FIELD   _data_ptr,      8*8,    8       # array of pointers to data
+END_FIELDS
+
+ _SHA256_ARGS_X4_size  =  _FIELD_OFFSET
+ _SHA256_ARGS_X4_align = _STRUCT_ALIGN
+ _SHA256_ARGS_X8_size  =	_FIELD_OFFSET
+ _SHA256_ARGS_X8_align =	_STRUCT_ALIGN
+
+#######################################################################
+
+START_FIELDS    # MB_MGR
+###     name            size    align
+FIELD   _args,          _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD   _lens,          4*8,    8
+FIELD   _unused_lanes,  8,      8
+FIELD   _ldata,         _LANE_DATA_size*8, _LANE_DATA_align
+END_FIELDS
+
+ _MB_MGR_size  =  _FIELD_OFFSET
+ _MB_MGR_align =  _STRUCT_ALIGN
+
+_args_digest   =     _args + _digest
+_args_data_ptr =     _args + _data_ptr
+
+#######################################################################
+
+START_FIELDS    #STACK_FRAME
+###     name            size    align
+FIELD   _data,		16*SZ8,   1       # transposed digest
+FIELD   _digest,         8*SZ8,   1       # array of pointers to data
+FIELD   _ytmp,           4*SZ8,   1
+FIELD   _rsp,            8,       1
+END_FIELDS
+
+ _STACK_FRAME_size  =  _FIELD_OFFSET
+ _STACK_FRAME_align =  _STRUCT_ALIGN
+
+#######################################################################
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN             0
+#define STS_BEING_PROCESSED     1
+#define STS_COMPLETED           2
+
+########################################################################
+#### Define JOB_SHA256 structure
+########################################################################
+
+START_FIELDS    # JOB_SHA256
+
+###     name                            size    align
+FIELD   _buffer,                        8,      8       # pointer to buffer
+FIELD   _len,                           8,      8       # length in bytes
+FIELD   _result_digest,                 8*4,    32      # Digest (output)
+FIELD   _status,                        4,      4
+FIELD   _user_data,                     8,      8
+END_FIELDS
+
+ _JOB_SHA256_size = _FIELD_OFFSET
+ _JOB_SHA256_align = _STRUCT_ALIGN
-- 
cgit v1.2.3


From 992532474ffa954ff678627a1c0f815d7b6cd7fc Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@intel.com>
Date: Thu, 23 Jun 2016 18:40:46 -0700
Subject: crypto: sha256-mb - Crypto computation (x8 AVX2)

This patch introduces the assembly routines to do SHA256 computation
on buffers belonging to several jobs at once.  The assembly routines
are optimized with AVX2 instructions that have 8 data lanes and using
AVX2 registers.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha256-mb/sha256_x8_avx2.S | 593 +++++++++++++++++++++++++++++
 1 file changed, 593 insertions(+)
 create mode 100644 arch/x86/crypto/sha256-mb/sha256_x8_avx2.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
new file mode 100644
index 000000000000..aa21aea4c722
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
@@ -0,0 +1,593 @@
+/*
+ * Multi-buffer SHA256 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+## code to compute oct SHA256 using SSE-256
+## outer calling routine takes care of save and restore of XMM registers
+## Logic designed/laid out by JDG
+
+## Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15; %ymm0-15
+## Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
+## Linux preserves:                       rdi rbp r8
+##
+## clobbers %ymm0-15
+
+arg1 = %rdi
+arg2 = %rsi
+reg3 = %rcx
+reg4 = %rdx
+
+# Common definitions
+STATE = arg1
+INP_SIZE = arg2
+
+IDX = %rax
+ROUND = %rbx
+TBL = reg3
+
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = reg4
+
+a = %ymm0
+b = %ymm1
+c = %ymm2
+d = %ymm3
+e = %ymm4
+f = %ymm5
+g = %ymm6
+h = %ymm7
+
+T1 = %ymm8
+
+a0 = %ymm12
+a1 = %ymm13
+a2 = %ymm14
+TMP = %ymm15
+TMP0 = %ymm6
+TMP1 = %ymm7
+
+TT0 = %ymm8
+TT1 = %ymm9
+TT2 = %ymm10
+TT3 = %ymm11
+TT4 = %ymm12
+TT5 = %ymm13
+TT6 = %ymm14
+TT7 = %ymm15
+
+# Define stack usage
+
+# Assume stack aligned to 32 bytes before call
+# Therefore FRAMESZ mod 32 must be 32-8 = 24
+
+#define FRAMESZ	0x388
+
+#define VMOVPS	vmovups
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# "transpose" data in {r0...r7} using temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+	# process top half (r0..r3) {a...d}
+	vshufps	$0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
+	vshufps	$0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
+	vshufps	$0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
+	vshufps	$0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
+	vshufps	$0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
+	vshufps	$0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
+	vshufps	$0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
+	vshufps	$0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}
+
+	# use r2 in place of t0
+	# process bottom half (r4..r7) {e...h}
+	vshufps	$0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
+	vshufps	$0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
+	vshufps	$0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
+	vshufps	$0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
+	vshufps	$0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
+	vshufps	$0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
+	vshufps	$0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
+	vshufps	$0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}
+
+	vperm2f128	$0x13, \r1, \r5, \r6  # h6...a6
+	vperm2f128	$0x02, \r1, \r5, \r2  # h2...a2
+	vperm2f128	$0x13, \r3, \r7, \r5  # h5...a5
+	vperm2f128	$0x02, \r3, \r7, \r1  # h1...a1
+	vperm2f128	$0x13, \r0, \r4, \r7  # h7...a7
+	vperm2f128	$0x02, \r0, \r4, \r3  # h3...a3
+	vperm2f128	$0x13, \t0, \t1, \r4  # h4...a4
+	vperm2f128	$0x02, \t0, \t1, \r0  # h0...a0
+
+.endm
+
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro _PRORD reg imm tmp
+	vpslld	$(32-\imm),\reg,\tmp
+	vpsrld	$\imm,\reg, \reg
+	vpor	\tmp,\reg, \reg
+.endm
+
+# PRORD_nd reg, imm, tmp, src
+.macro _PRORD_nd reg imm tmp src
+	vpslld	$(32-\imm), \src, \tmp
+	vpsrld	$\imm, \src, \reg
+	vpor	\tmp, \reg, \reg
+.endm
+
+# PRORD dst/src, amt
+.macro PRORD reg imm
+	_PRORD	\reg,\imm,TMP
+.endm
+
+# PRORD_nd dst, src, amt
+.macro PRORD_nd reg tmp imm
+	_PRORD_nd	\reg, \imm, TMP, \tmp
+.endm
+
+# arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_00_15 _T1 i
+	PRORD_nd	a0,e,5	# sig1: a0 = (e >> 5)
+
+	vpxor	g, f, a2	# ch: a2 = f^g
+	vpand	e,a2, a2	# ch: a2 = (f^g)&e
+	vpxor	g, a2, a2	# a2 = ch
+
+	PRORD_nd	a1,e,25	# sig1: a1 = (e >> 25)
+
+	vmovdqu	\_T1,(SZ8*(\i & 0xf))(%rsp)
+	vpaddd	(TBL,ROUND,1), \_T1, \_T1	# T1 = W + K
+	vpxor	e,a0, a0	# sig1: a0 = e ^ (e >> 5)
+	PRORD	a0, 6		# sig1: a0 = (e >> 6) ^ (e >> 11)
+	vpaddd	a2, h, h	# h = h + ch
+	PRORD_nd	a2,a,11	# sig0: a2 = (a >> 11)
+	vpaddd	\_T1,h, h 	# h = h + ch + W + K
+	vpxor	a1, a0, a0	# a0 = sigma1
+	PRORD_nd	a1,a,22	# sig0: a1 = (a >> 22)
+	vpxor	c, a, \_T1	# maj: T1 = a^c
+	add	$SZ8, ROUND	# ROUND++
+	vpand	b, \_T1, \_T1	# maj: T1 = (a^c)&b
+	vpaddd	a0, h, h
+	vpaddd	h, d, d
+	vpxor	a, a2, a2	# sig0: a2 = a ^ (a >> 11)
+	PRORD	a2,2		# sig0: a2 = (a >> 2) ^ (a >> 13)
+	vpxor	a1, a2, a2	# a2 = sig0
+	vpand	c, a, a1	# maj: a1 = a&c
+	vpor	\_T1, a1, a1 	# a1 = maj
+	vpaddd	a1, h, h	# h = h + ch + W + K + maj
+	vpaddd	a2, h, h	# h = h + ch + W + K + maj + sigma0
+	ROTATE_ARGS
+.endm
+
+# arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_16_XX _T1 i
+	vmovdqu	(SZ8*((\i-15)&0xf))(%rsp), \_T1
+	vmovdqu	(SZ8*((\i-2)&0xf))(%rsp), a1
+	vmovdqu	\_T1, a0
+	PRORD	\_T1,11
+	vmovdqu	a1, a2
+	PRORD	a1,2
+	vpxor	a0, \_T1, \_T1
+	PRORD	\_T1, 7
+	vpxor	a2, a1, a1
+	PRORD	a1, 17
+	vpsrld	$3, a0, a0
+	vpxor	a0, \_T1, \_T1
+	vpsrld	$10, a2, a2
+	vpxor	a2, a1, a1
+	vpaddd	(SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
+	vpaddd	(SZ8*((\i-7)&0xf))(%rsp), a1, a1
+	vpaddd	a1, \_T1, \_T1
+
+	ROUND_00_15 \_T1,\i
+.endm
+
+# SHA256_ARGS:
+#   UINT128 digest[8];  // transposed digests
+#   UINT8  *data_ptr[4];
+
+# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 bytes);
+# arg 1 : STATE : pointer to array of pointers to input data
+# arg 2 : INP_SIZE  : size of input in blocks
+	# general registers preserved in outer calling routine
+	# outer calling routine saves all the XMM registers
+	# save rsp, allocate 32-byte aligned for local variables
+ENTRY(sha256_x8_avx2)
+
+	# save callee-saved clobbered registers to comply with C function ABI
+	push    %r12
+	push    %r13
+	push    %r14
+	push    %r15
+
+	mov	%rsp, IDX
+	sub	$FRAMESZ, %rsp
+	and	$~0x1F, %rsp
+	mov	IDX, _rsp(%rsp)
+
+	# Load the pre-transposed incoming digest.
+	vmovdqu	0*SHA256_DIGEST_ROW_SIZE(STATE),a
+	vmovdqu	1*SHA256_DIGEST_ROW_SIZE(STATE),b
+	vmovdqu	2*SHA256_DIGEST_ROW_SIZE(STATE),c
+	vmovdqu	3*SHA256_DIGEST_ROW_SIZE(STATE),d
+	vmovdqu	4*SHA256_DIGEST_ROW_SIZE(STATE),e
+	vmovdqu	5*SHA256_DIGEST_ROW_SIZE(STATE),f
+	vmovdqu	6*SHA256_DIGEST_ROW_SIZE(STATE),g
+	vmovdqu	7*SHA256_DIGEST_ROW_SIZE(STATE),h
+
+	lea	K256_8(%rip),TBL
+
+	# load the address of each of the 4 message lanes
+	# getting ready to transpose input onto stack
+	mov	_args_data_ptr+0*PTR_SZ(STATE),inp0
+	mov	_args_data_ptr+1*PTR_SZ(STATE),inp1
+	mov	_args_data_ptr+2*PTR_SZ(STATE),inp2
+	mov	_args_data_ptr+3*PTR_SZ(STATE),inp3
+	mov	_args_data_ptr+4*PTR_SZ(STATE),inp4
+	mov	_args_data_ptr+5*PTR_SZ(STATE),inp5
+	mov	_args_data_ptr+6*PTR_SZ(STATE),inp6
+	mov	_args_data_ptr+7*PTR_SZ(STATE),inp7
+
+	xor	IDX, IDX
+lloop:
+	xor	ROUND, ROUND
+
+	# save old digest
+	vmovdqu	a, _digest(%rsp)
+	vmovdqu	b, _digest+1*SZ8(%rsp)
+	vmovdqu	c, _digest+2*SZ8(%rsp)
+	vmovdqu	d, _digest+3*SZ8(%rsp)
+	vmovdqu	e, _digest+4*SZ8(%rsp)
+	vmovdqu	f, _digest+5*SZ8(%rsp)
+	vmovdqu	g, _digest+6*SZ8(%rsp)
+	vmovdqu	h, _digest+7*SZ8(%rsp)
+	i = 0
+.rep 2
+	VMOVPS	i*32(inp0, IDX), TT0
+	VMOVPS	i*32(inp1, IDX), TT1
+	VMOVPS	i*32(inp2, IDX), TT2
+	VMOVPS	i*32(inp3, IDX), TT3
+	VMOVPS	i*32(inp4, IDX), TT4
+	VMOVPS	i*32(inp5, IDX), TT5
+	VMOVPS	i*32(inp6, IDX), TT6
+	VMOVPS	i*32(inp7, IDX), TT7
+	vmovdqu	g, _ytmp(%rsp)
+	vmovdqu	h, _ytmp+1*SZ8(%rsp)
+	TRANSPOSE8	TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7,   TMP0, TMP1
+	vmovdqu	PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
+	vmovdqu	_ytmp(%rsp), g
+	vpshufb	TMP1, TT0, TT0
+	vpshufb	TMP1, TT1, TT1
+	vpshufb	TMP1, TT2, TT2
+	vpshufb	TMP1, TT3, TT3
+	vpshufb	TMP1, TT4, TT4
+	vpshufb	TMP1, TT5, TT5
+	vpshufb	TMP1, TT6, TT6
+	vpshufb	TMP1, TT7, TT7
+	vmovdqu	_ytmp+1*SZ8(%rsp), h
+	vmovdqu	TT4, _ytmp(%rsp)
+	vmovdqu	TT5, _ytmp+1*SZ8(%rsp)
+	vmovdqu	TT6, _ytmp+2*SZ8(%rsp)
+	vmovdqu	TT7, _ytmp+3*SZ8(%rsp)
+	ROUND_00_15	TT0,(i*8+0)
+	vmovdqu	_ytmp(%rsp), TT0
+	ROUND_00_15	TT1,(i*8+1)
+	vmovdqu	_ytmp+1*SZ8(%rsp), TT1
+	ROUND_00_15	TT2,(i*8+2)
+	vmovdqu	_ytmp+2*SZ8(%rsp), TT2
+	ROUND_00_15	TT3,(i*8+3)
+	vmovdqu	_ytmp+3*SZ8(%rsp), TT3
+	ROUND_00_15	TT0,(i*8+4)
+	ROUND_00_15	TT1,(i*8+5)
+	ROUND_00_15	TT2,(i*8+6)
+	ROUND_00_15	TT3,(i*8+7)
+	i = (i+1)
+.endr
+	add	$64, IDX
+	i = (i*8)
+
+	jmp	Lrounds_16_xx
+.align 16
+Lrounds_16_xx:
+.rep 16
+	ROUND_16_XX	T1, i
+	i = (i+1)
+.endr
+
+	cmp	$ROUNDS,ROUND
+	jb	Lrounds_16_xx
+
+	# add old digest
+	vpaddd	_digest+0*SZ8(%rsp), a, a
+	vpaddd	_digest+1*SZ8(%rsp), b, b
+	vpaddd	_digest+2*SZ8(%rsp), c, c
+	vpaddd	_digest+3*SZ8(%rsp), d, d
+	vpaddd	_digest+4*SZ8(%rsp), e, e
+	vpaddd	_digest+5*SZ8(%rsp), f, f
+	vpaddd	_digest+6*SZ8(%rsp), g, g
+	vpaddd	_digest+7*SZ8(%rsp), h, h
+
+	sub	$1, INP_SIZE  # unit is blocks
+	jne	lloop
+
+	# write back to memory (state object) the transposed digest
+	vmovdqu	a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	h, 7*SHA256_DIGEST_ROW_SIZE(STATE)
+
+	# update input pointers
+	add	IDX, inp0
+	mov	inp0, _args_data_ptr+0*8(STATE)
+	add	IDX, inp1
+	mov	inp1, _args_data_ptr+1*8(STATE)
+	add	IDX, inp2
+	mov	inp2, _args_data_ptr+2*8(STATE)
+	add	IDX, inp3
+	mov	inp3, _args_data_ptr+3*8(STATE)
+	add	IDX, inp4
+	mov	inp4, _args_data_ptr+4*8(STATE)
+	add	IDX, inp5
+	mov	inp5, _args_data_ptr+5*8(STATE)
+	add	IDX, inp6
+	mov	inp6, _args_data_ptr+6*8(STATE)
+	add	IDX, inp7
+	mov	inp7, _args_data_ptr+7*8(STATE)
+
+	# Postamble
+	mov	_rsp(%rsp), %rsp
+
+	# restore callee-saved clobbered registers
+	pop     %r15
+	pop     %r14
+	pop     %r13
+	pop     %r12
+
+	ret
+ENDPROC(sha256_x8_avx2)
+.data
+.align 64
+K256_8:
+	.octa	0x428a2f98428a2f98428a2f98428a2f98
+	.octa	0x428a2f98428a2f98428a2f98428a2f98
+	.octa	0x71374491713744917137449171374491
+	.octa	0x71374491713744917137449171374491
+	.octa	0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+	.octa	0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+	.octa	0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+	.octa	0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+	.octa	0x3956c25b3956c25b3956c25b3956c25b
+	.octa	0x3956c25b3956c25b3956c25b3956c25b
+	.octa	0x59f111f159f111f159f111f159f111f1
+	.octa	0x59f111f159f111f159f111f159f111f1
+	.octa	0x923f82a4923f82a4923f82a4923f82a4
+	.octa	0x923f82a4923f82a4923f82a4923f82a4
+	.octa	0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+	.octa	0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+	.octa	0xd807aa98d807aa98d807aa98d807aa98
+	.octa	0xd807aa98d807aa98d807aa98d807aa98
+	.octa	0x12835b0112835b0112835b0112835b01
+	.octa	0x12835b0112835b0112835b0112835b01
+	.octa	0x243185be243185be243185be243185be
+	.octa	0x243185be243185be243185be243185be
+	.octa	0x550c7dc3550c7dc3550c7dc3550c7dc3
+	.octa	0x550c7dc3550c7dc3550c7dc3550c7dc3
+	.octa	0x72be5d7472be5d7472be5d7472be5d74
+	.octa	0x72be5d7472be5d7472be5d7472be5d74
+	.octa	0x80deb1fe80deb1fe80deb1fe80deb1fe
+	.octa	0x80deb1fe80deb1fe80deb1fe80deb1fe
+	.octa	0x9bdc06a79bdc06a79bdc06a79bdc06a7
+	.octa	0x9bdc06a79bdc06a79bdc06a79bdc06a7
+	.octa	0xc19bf174c19bf174c19bf174c19bf174
+	.octa	0xc19bf174c19bf174c19bf174c19bf174
+	.octa	0xe49b69c1e49b69c1e49b69c1e49b69c1
+	.octa	0xe49b69c1e49b69c1e49b69c1e49b69c1
+	.octa	0xefbe4786efbe4786efbe4786efbe4786
+	.octa	0xefbe4786efbe4786efbe4786efbe4786
+	.octa	0x0fc19dc60fc19dc60fc19dc60fc19dc6
+	.octa	0x0fc19dc60fc19dc60fc19dc60fc19dc6
+	.octa	0x240ca1cc240ca1cc240ca1cc240ca1cc
+	.octa	0x240ca1cc240ca1cc240ca1cc240ca1cc
+	.octa	0x2de92c6f2de92c6f2de92c6f2de92c6f
+	.octa	0x2de92c6f2de92c6f2de92c6f2de92c6f
+	.octa	0x4a7484aa4a7484aa4a7484aa4a7484aa
+	.octa	0x4a7484aa4a7484aa4a7484aa4a7484aa
+	.octa	0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+	.octa	0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+	.octa	0x76f988da76f988da76f988da76f988da
+	.octa	0x76f988da76f988da76f988da76f988da
+	.octa	0x983e5152983e5152983e5152983e5152
+	.octa	0x983e5152983e5152983e5152983e5152
+	.octa	0xa831c66da831c66da831c66da831c66d
+	.octa	0xa831c66da831c66da831c66da831c66d
+	.octa	0xb00327c8b00327c8b00327c8b00327c8
+	.octa	0xb00327c8b00327c8b00327c8b00327c8
+	.octa	0xbf597fc7bf597fc7bf597fc7bf597fc7
+	.octa	0xbf597fc7bf597fc7bf597fc7bf597fc7
+	.octa	0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+	.octa	0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+	.octa	0xd5a79147d5a79147d5a79147d5a79147
+	.octa	0xd5a79147d5a79147d5a79147d5a79147
+	.octa	0x06ca635106ca635106ca635106ca6351
+	.octa	0x06ca635106ca635106ca635106ca6351
+	.octa	0x14292967142929671429296714292967
+	.octa	0x14292967142929671429296714292967
+	.octa	0x27b70a8527b70a8527b70a8527b70a85
+	.octa	0x27b70a8527b70a8527b70a8527b70a85
+	.octa	0x2e1b21382e1b21382e1b21382e1b2138
+	.octa	0x2e1b21382e1b21382e1b21382e1b2138
+	.octa	0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+	.octa	0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+	.octa	0x53380d1353380d1353380d1353380d13
+	.octa	0x53380d1353380d1353380d1353380d13
+	.octa	0x650a7354650a7354650a7354650a7354
+	.octa	0x650a7354650a7354650a7354650a7354
+	.octa	0x766a0abb766a0abb766a0abb766a0abb
+	.octa	0x766a0abb766a0abb766a0abb766a0abb
+	.octa	0x81c2c92e81c2c92e81c2c92e81c2c92e
+	.octa	0x81c2c92e81c2c92e81c2c92e81c2c92e
+	.octa	0x92722c8592722c8592722c8592722c85
+	.octa	0x92722c8592722c8592722c8592722c85
+	.octa	0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+	.octa	0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+	.octa	0xa81a664ba81a664ba81a664ba81a664b
+	.octa	0xa81a664ba81a664ba81a664ba81a664b
+	.octa	0xc24b8b70c24b8b70c24b8b70c24b8b70
+	.octa	0xc24b8b70c24b8b70c24b8b70c24b8b70
+	.octa	0xc76c51a3c76c51a3c76c51a3c76c51a3
+	.octa	0xc76c51a3c76c51a3c76c51a3c76c51a3
+	.octa	0xd192e819d192e819d192e819d192e819
+	.octa	0xd192e819d192e819d192e819d192e819
+	.octa	0xd6990624d6990624d6990624d6990624
+	.octa	0xd6990624d6990624d6990624d6990624
+	.octa	0xf40e3585f40e3585f40e3585f40e3585
+	.octa	0xf40e3585f40e3585f40e3585f40e3585
+	.octa	0x106aa070106aa070106aa070106aa070
+	.octa	0x106aa070106aa070106aa070106aa070
+	.octa	0x19a4c11619a4c11619a4c11619a4c116
+	.octa	0x19a4c11619a4c11619a4c11619a4c116
+	.octa	0x1e376c081e376c081e376c081e376c08
+	.octa	0x1e376c081e376c081e376c081e376c08
+	.octa	0x2748774c2748774c2748774c2748774c
+	.octa	0x2748774c2748774c2748774c2748774c
+	.octa	0x34b0bcb534b0bcb534b0bcb534b0bcb5
+	.octa	0x34b0bcb534b0bcb534b0bcb534b0bcb5
+	.octa	0x391c0cb3391c0cb3391c0cb3391c0cb3
+	.octa	0x391c0cb3391c0cb3391c0cb3391c0cb3
+	.octa	0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+	.octa	0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+	.octa	0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+	.octa	0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+	.octa	0x682e6ff3682e6ff3682e6ff3682e6ff3
+	.octa	0x682e6ff3682e6ff3682e6ff3682e6ff3
+	.octa	0x748f82ee748f82ee748f82ee748f82ee
+	.octa	0x748f82ee748f82ee748f82ee748f82ee
+	.octa	0x78a5636f78a5636f78a5636f78a5636f
+	.octa	0x78a5636f78a5636f78a5636f78a5636f
+	.octa	0x84c8781484c8781484c8781484c87814
+	.octa	0x84c8781484c8781484c8781484c87814
+	.octa	0x8cc702088cc702088cc702088cc70208
+	.octa	0x8cc702088cc702088cc702088cc70208
+	.octa	0x90befffa90befffa90befffa90befffa
+	.octa	0x90befffa90befffa90befffa90befffa
+	.octa	0xa4506ceba4506ceba4506ceba4506ceb
+	.octa	0xa4506ceba4506ceba4506ceba4506ceb
+	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+	.octa	0xc67178f2c67178f2c67178f2c67178f2
+	.octa	0xc67178f2c67178f2c67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203
+
+.align 64
+.global K256
+K256:
+	.int	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.int	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.int	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.int	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.int	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.int	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.int	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.int	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.int	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.int	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.int	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.int	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.int	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.int	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.int	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.int	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-- 
cgit v1.2.3


From 4c79f6f81a23052e5d1f13a3e6e9aaf915621430 Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@linux.intel.com>
Date: Thu, 23 Jun 2016 18:40:48 -0700
Subject: crypto: sha1-mb - rename sha-mb to sha1-mb

Until now, there was only support for the SHA1 multibuffer algorithm.
Hence, there was just one sha-mb folder. Now, with the introduction of
the SHA256 multi-buffer algorithm , it is logical to name the existing
folder as sha1-mb.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                          |    2 +-
 arch/x86/crypto/sha-mb/Makefile                   |   11 -
 arch/x86/crypto/sha-mb/sha1_mb.c                  | 1028 ---------------------
 arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S   |  287 ------
 arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S   |  302 ------
 arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c    |   64 --
 arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S  |  210 -----
 arch/x86/crypto/sha-mb/sha1_x8_avx2.S             |  481 ----------
 arch/x86/crypto/sha-mb/sha_mb_ctx.h               |  136 ---
 arch/x86/crypto/sha-mb/sha_mb_mgr.h               |  110 ---
 arch/x86/crypto/sha1-mb/Makefile                  |   11 +
 arch/x86/crypto/sha1-mb/sha1_mb.c                 | 1028 +++++++++++++++++++++
 arch/x86/crypto/sha1-mb/sha1_mb_ctx.h             |  136 +++
 arch/x86/crypto/sha1-mb/sha1_mb_mgr.h             |  110 +++
 arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S  |  287 ++++++
 arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S  |  302 ++++++
 arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c   |   64 ++
 arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S |  210 +++++
 arch/x86/crypto/sha1-mb/sha1_x8_avx2.S            |  481 ++++++++++
 19 files changed, 2630 insertions(+), 2630 deletions(-)
 delete mode 100644 arch/x86/crypto/sha-mb/Makefile
 delete mode 100644 arch/x86/crypto/sha-mb/sha1_mb.c
 delete mode 100644 arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S
 delete mode 100644 arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
 delete mode 100644 arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
 delete mode 100644 arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
 delete mode 100644 arch/x86/crypto/sha-mb/sha1_x8_avx2.S
 delete mode 100644 arch/x86/crypto/sha-mb/sha_mb_ctx.h
 delete mode 100644 arch/x86/crypto/sha-mb/sha_mb_mgr.h
 create mode 100644 arch/x86/crypto/sha1-mb/Makefile
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_mb.c
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
 create mode 100644 arch/x86/crypto/sha1-mb/sha1_x8_avx2.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index aafff22b0ea6..52c868523921 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -49,7 +49,7 @@ endif
 ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/
+	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
 	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
 endif
 
diff --git a/arch/x86/crypto/sha-mb/Makefile b/arch/x86/crypto/sha-mb/Makefile
deleted file mode 100644
index 2f8756375df5..000000000000
--- a/arch/x86/crypto/sha-mb/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Arch-specific CryptoAPI modules.
-#
-
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
-                                $(comma)4)$(comma)%ymm2,yes,no)
-ifeq ($(avx2_supported),yes)
-	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o
-	sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \
-	     sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o
-endif
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
deleted file mode 100644
index 669cc37268e1..000000000000
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ /dev/null
@@ -1,1028 +0,0 @@
-/*
- * Multi buffer SHA1 algorithm Glue Code
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <crypto/scatterwalk.h>
-#include <crypto/sha.h>
-#include <crypto/mcryptd.h>
-#include <crypto/crypto_wq.h>
-#include <asm/byteorder.h>
-#include <linux/hardirq.h>
-#include <asm/fpu/api.h>
-#include "sha_mb_ctx.h"
-
-#define FLUSH_INTERVAL 1000 /* in usec */
-
-static struct mcryptd_alg_state sha1_mb_alg_state;
-
-struct sha1_mb_ctx {
-	struct mcryptd_ahash *mcryptd_tfm;
-};
-
-static inline struct mcryptd_hash_request_ctx
-		*cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
-{
-	struct ahash_request *areq;
-
-	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
-	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
-}
-
-static inline struct ahash_request
-		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
-{
-	return container_of((void *) ctx, struct ahash_request, __ctx);
-}
-
-static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
-				struct ahash_request *areq)
-{
-	rctx->flag = HASH_UPDATE;
-}
-
-static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)
-			(struct sha1_mb_mgr *state, struct job_sha1 *job);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)
-						(struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)
-						(struct sha1_mb_mgr *state);
-
-static inline void sha1_init_digest(uint32_t *digest)
-{
-	static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0,
-					SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 };
-	memcpy(digest, initial_digest, sizeof(initial_digest));
-}
-
-static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
-			 uint32_t total_len)
-{
-	uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
-
-	memset(&padblock[i], 0, SHA1_BLOCK_SIZE);
-	padblock[i] = 0x80;
-
-	i += ((SHA1_BLOCK_SIZE - 1) &
-	      (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1)))
-	     + 1 + SHA1_PADLENGTHFIELD_SIZE;
-
-#if SHA1_PADLENGTHFIELD_SIZE == 16
-	*((uint64_t *) &padblock[i - 16]) = 0;
-#endif
-
-	*((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
-
-	/* Number of extra blocks to hash */
-	return i >> SHA1_LOG2_BLOCK_SIZE;
-}
-
-static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr,
-						struct sha1_hash_ctx *ctx)
-{
-	while (ctx) {
-		if (ctx->status & HASH_CTX_STS_COMPLETE) {
-			/* Clear PROCESSING bit */
-			ctx->status = HASH_CTX_STS_COMPLETE;
-			return ctx;
-		}
-
-		/*
-		 * If the extra blocks are empty, begin hashing what remains
-		 * in the user's buffer.
-		 */
-		if (ctx->partial_block_buffer_length == 0 &&
-		    ctx->incoming_buffer_length) {
-
-			const void *buffer = ctx->incoming_buffer;
-			uint32_t len = ctx->incoming_buffer_length;
-			uint32_t copy_len;
-
-			/*
-			 * Only entire blocks can be hashed.
-			 * Copy remainder to extra blocks buffer.
-			 */
-			copy_len = len & (SHA1_BLOCK_SIZE-1);
-
-			if (copy_len) {
-				len -= copy_len;
-				memcpy(ctx->partial_block_buffer,
-				       ((const char *) buffer + len),
-				       copy_len);
-				ctx->partial_block_buffer_length = copy_len;
-			}
-
-			ctx->incoming_buffer_length = 0;
-
-			/* len should be a multiple of the block size now */
-			assert((len % SHA1_BLOCK_SIZE) == 0);
-
-			/* Set len to the number of blocks to be hashed */
-			len >>= SHA1_LOG2_BLOCK_SIZE;
-
-			if (len) {
-
-				ctx->job.buffer = (uint8_t *) buffer;
-				ctx->job.len = len;
-				ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr,
-										&ctx->job);
-				continue;
-			}
-		}
-
-		/*
-		 * If the extra blocks are not empty, then we are
-		 * either on the last block(s) or we need more
-		 * user input before continuing.
-		 */
-		if (ctx->status & HASH_CTX_STS_LAST) {
-
-			uint8_t *buf = ctx->partial_block_buffer;
-			uint32_t n_extra_blocks =
-					sha1_pad(buf, ctx->total_length);
-
-			ctx->status = (HASH_CTX_STS_PROCESSING |
-				       HASH_CTX_STS_COMPLETE);
-			ctx->job.buffer = buf;
-			ctx->job.len = (uint32_t) n_extra_blocks;
-			ctx = (struct sha1_hash_ctx *)
-				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
-			continue;
-		}
-
-		ctx->status = HASH_CTX_STS_IDLE;
-		return ctx;
-	}
-
-	return NULL;
-}
-
-static struct sha1_hash_ctx
-			*sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
-{
-	/*
-	 * If get_comp_job returns NULL, there are no jobs complete.
-	 * If get_comp_job returns a job, verify that it is safe to return to
-	 * the user.
-	 * If it is not ready, resubmit the job to finish processing.
-	 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
-	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr
-	 * still need processing.
-	 */
-	struct sha1_hash_ctx *ctx;
-
-	ctx = (struct sha1_hash_ctx *) sha1_job_mgr_get_comp_job(&mgr->mgr);
-	return sha1_ctx_mgr_resubmit(mgr, ctx);
-}
-
-static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr)
-{
-	sha1_job_mgr_init(&mgr->mgr);
-}
-
-static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
-					  struct sha1_hash_ctx *ctx,
-					  const void *buffer,
-					  uint32_t len,
-					  int flags)
-{
-	if (flags & (~HASH_ENTIRE)) {
-		/*
-		 * User should not pass anything other than FIRST, UPDATE, or
-		 * LAST
-		 */
-		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
-		return ctx;
-	}
-
-	if (ctx->status & HASH_CTX_STS_PROCESSING) {
-		/* Cannot submit to a currently processing job. */
-		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
-		return ctx;
-	}
-
-	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
-		/* Cannot update a finished job. */
-		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
-		return ctx;
-	}
-
-
-	if (flags & HASH_FIRST) {
-		/* Init digest */
-		sha1_init_digest(ctx->job.result_digest);
-
-		/* Reset byte counter */
-		ctx->total_length = 0;
-
-		/* Clear extra blocks */
-		ctx->partial_block_buffer_length = 0;
-	}
-
-	/*
-	 * If we made it here, there were no errors during this call to
-	 * submit
-	 */
-	ctx->error = HASH_CTX_ERROR_NONE;
-
-	/* Store buffer ptr info from user */
-	ctx->incoming_buffer = buffer;
-	ctx->incoming_buffer_length = len;
-
-	/*
-	 * Store the user's request flags and mark this ctx as currently
-	 * being processed.
-	 */
-	ctx->status = (flags & HASH_LAST) ?
-			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
-			HASH_CTX_STS_PROCESSING;
-
-	/* Advance byte counter */
-	ctx->total_length += len;
-
-	/*
-	 * If there is anything currently buffered in the extra blocks,
-	 * append to it until it contains a whole block.
-	 * Or if the user's buffer contains less than a whole block,
-	 * append as much as possible to the extra block.
-	 */
-	if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
-		/*
-		 * Compute how many bytes to copy from user buffer into
-		 * extra block
-		 */
-		uint32_t copy_len = SHA1_BLOCK_SIZE -
-					ctx->partial_block_buffer_length;
-		if (len < copy_len)
-			copy_len = len;
-
-		if (copy_len) {
-			/* Copy and update relevant pointers and counters */
-			memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
-				buffer, copy_len);
-
-			ctx->partial_block_buffer_length += copy_len;
-			ctx->incoming_buffer = (const void *)
-					((const char *)buffer + copy_len);
-			ctx->incoming_buffer_length = len - copy_len;
-		}
-
-		/*
-		 * The extra block should never contain more than 1 block
-		 * here
-		 */
-		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
-
-		/*
-		 * If the extra block buffer contains exactly 1 block, it can
-		 * be hashed.
-		 */
-		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
-			ctx->partial_block_buffer_length = 0;
-
-			ctx->job.buffer = ctx->partial_block_buffer;
-			ctx->job.len = 1;
-			ctx = (struct sha1_hash_ctx *)
-				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
-		}
-	}
-
-	return sha1_ctx_mgr_resubmit(mgr, ctx);
-}
-
-static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
-{
-	struct sha1_hash_ctx *ctx;
-
-	while (1) {
-		ctx = (struct sha1_hash_ctx *) sha1_job_mgr_flush(&mgr->mgr);
-
-		/* If flush returned 0, there are no more jobs in flight. */
-		if (!ctx)
-			return NULL;
-
-		/*
-		 * If flush returned a job, resubmit the job to finish
-		 * processing.
-		 */
-		ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
-
-		/*
-		 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be
-		 * returned. Otherwise, all jobs currently being managed by the
-		 * sha1_ctx_mgr still need processing. Loop.
-		 */
-		if (ctx)
-			return ctx;
-	}
-}
-
-static int sha1_mb_init(struct ahash_request *areq)
-{
-	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
-
-	hash_ctx_init(sctx);
-	sctx->job.result_digest[0] = SHA1_H0;
-	sctx->job.result_digest[1] = SHA1_H1;
-	sctx->job.result_digest[2] = SHA1_H2;
-	sctx->job.result_digest[3] = SHA1_H3;
-	sctx->job.result_digest[4] = SHA1_H4;
-	sctx->total_length = 0;
-	sctx->partial_block_buffer_length = 0;
-	sctx->status = HASH_CTX_STS_IDLE;
-
-	return 0;
-}
-
-static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
-{
-	int	i;
-	struct	sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
-	__be32	*dst = (__be32 *) rctx->out;
-
-	for (i = 0; i < 5; ++i)
-		dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
-
-	return 0;
-}
-
-static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
-			struct mcryptd_alg_cstate *cstate, bool flush)
-{
-	int	flag = HASH_UPDATE;
-	int	nbytes, err = 0;
-	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
-	struct sha1_hash_ctx *sha_ctx;
-
-	/* more work ? */
-	while (!(rctx->flag & HASH_DONE)) {
-		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
-		if (nbytes < 0) {
-			err = nbytes;
-			goto out;
-		}
-		/* check if the walk is done */
-		if (crypto_ahash_walk_last(&rctx->walk)) {
-			rctx->flag |= HASH_DONE;
-			if (rctx->flag & HASH_FINAL)
-				flag |= HASH_LAST;
-
-		}
-		sha_ctx = (struct sha1_hash_ctx *)
-						ahash_request_ctx(&rctx->areq);
-		kernel_fpu_begin();
-		sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
-						rctx->walk.data, nbytes, flag);
-		if (!sha_ctx) {
-			if (flush)
-				sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
-		}
-		kernel_fpu_end();
-		if (sha_ctx)
-			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-		else {
-			rctx = NULL;
-			goto out;
-		}
-	}
-
-	/* copy the results */
-	if (rctx->flag & HASH_FINAL)
-		sha1_mb_set_results(rctx);
-
-out:
-	*ret_rctx = rctx;
-	return err;
-}
-
-static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
-			    struct mcryptd_alg_cstate *cstate,
-			    int err)
-{
-	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
-	struct sha1_hash_ctx *sha_ctx;
-	struct mcryptd_hash_request_ctx *req_ctx;
-	int ret;
-
-	/* remove from work list */
-	spin_lock(&cstate->work_lock);
-	list_del(&rctx->waiter);
-	spin_unlock(&cstate->work_lock);
-
-	if (irqs_disabled())
-		rctx->complete(&req->base, err);
-	else {
-		local_bh_disable();
-		rctx->complete(&req->base, err);
-		local_bh_enable();
-	}
-
-	/* check to see if there are other jobs that are done */
-	sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
-	while (sha_ctx) {
-		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-		ret = sha_finish_walk(&req_ctx, cstate, false);
-		if (req_ctx) {
-			spin_lock(&cstate->work_lock);
-			list_del(&req_ctx->waiter);
-			spin_unlock(&cstate->work_lock);
-
-			req = cast_mcryptd_ctx_to_req(req_ctx);
-			if (irqs_disabled())
-				req_ctx->complete(&req->base, ret);
-			else {
-				local_bh_disable();
-				req_ctx->complete(&req->base, ret);
-				local_bh_enable();
-			}
-		}
-		sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
-	}
-
-	return 0;
-}
-
-static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
-			     struct mcryptd_alg_cstate *cstate)
-{
-	unsigned long next_flush;
-	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
-
-	/* initialize tag */
-	rctx->tag.arrival = jiffies;    /* tag the arrival time */
-	rctx->tag.seq_num = cstate->next_seq_num++;
-	next_flush = rctx->tag.arrival + delay;
-	rctx->tag.expire = next_flush;
-
-	spin_lock(&cstate->work_lock);
-	list_add_tail(&rctx->waiter, &cstate->work_list);
-	spin_unlock(&cstate->work_lock);
-
-	mcryptd_arm_flusher(cstate, delay);
-}
-
-static int sha1_mb_update(struct ahash_request *areq)
-{
-	struct mcryptd_hash_request_ctx *rctx =
-		container_of(areq, struct mcryptd_hash_request_ctx, areq);
-	struct mcryptd_alg_cstate *cstate =
-				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
-
-	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
-	struct sha1_hash_ctx *sha_ctx;
-	int ret = 0, nbytes;
-
-
-	/* sanity check */
-	if (rctx->tag.cpu != smp_processor_id()) {
-		pr_err("mcryptd error: cpu clash\n");
-		goto done;
-	}
-
-	/* need to init context */
-	req_ctx_init(rctx, areq);
-
-	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
-	if (nbytes < 0) {
-		ret = nbytes;
-		goto done;
-	}
-
-	if (crypto_ahash_walk_last(&rctx->walk))
-		rctx->flag |= HASH_DONE;
-
-	/* submit */
-	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
-	sha1_mb_add_list(rctx, cstate);
-	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
-							nbytes, HASH_UPDATE);
-	kernel_fpu_end();
-
-	/* check if anything is returned */
-	if (!sha_ctx)
-		return -EINPROGRESS;
-
-	if (sha_ctx->error) {
-		ret = sha_ctx->error;
-		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-		goto done;
-	}
-
-	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-	ret = sha_finish_walk(&rctx, cstate, false);
-
-	if (!rctx)
-		return -EINPROGRESS;
-done:
-	sha_complete_job(rctx, cstate, ret);
-	return ret;
-}
-
-static int sha1_mb_finup(struct ahash_request *areq)
-{
-	struct mcryptd_hash_request_ctx *rctx =
-		container_of(areq, struct mcryptd_hash_request_ctx, areq);
-	struct mcryptd_alg_cstate *cstate =
-				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
-
-	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
-	struct sha1_hash_ctx *sha_ctx;
-	int ret = 0, flag = HASH_UPDATE, nbytes;
-
-	/* sanity check */
-	if (rctx->tag.cpu != smp_processor_id()) {
-		pr_err("mcryptd error: cpu clash\n");
-		goto done;
-	}
-
-	/* need to init context */
-	req_ctx_init(rctx, areq);
-
-	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
-	if (nbytes < 0) {
-		ret = nbytes;
-		goto done;
-	}
-
-	if (crypto_ahash_walk_last(&rctx->walk)) {
-		rctx->flag |= HASH_DONE;
-		flag = HASH_LAST;
-	}
-
-	/* submit */
-	rctx->flag |= HASH_FINAL;
-	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
-	sha1_mb_add_list(rctx, cstate);
-
-	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
-								nbytes, flag);
-	kernel_fpu_end();
-
-	/* check if anything is returned */
-	if (!sha_ctx)
-		return -EINPROGRESS;
-
-	if (sha_ctx->error) {
-		ret = sha_ctx->error;
-		goto done;
-	}
-
-	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-	ret = sha_finish_walk(&rctx, cstate, false);
-	if (!rctx)
-		return -EINPROGRESS;
-done:
-	sha_complete_job(rctx, cstate, ret);
-	return ret;
-}
-
-static int sha1_mb_final(struct ahash_request *areq)
-{
-	struct mcryptd_hash_request_ctx *rctx =
-		container_of(areq, struct mcryptd_hash_request_ctx, areq);
-	struct mcryptd_alg_cstate *cstate =
-				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
-
-	struct sha1_hash_ctx *sha_ctx;
-	int ret = 0;
-	u8 data;
-
-	/* sanity check */
-	if (rctx->tag.cpu != smp_processor_id()) {
-		pr_err("mcryptd error: cpu clash\n");
-		goto done;
-	}
-
-	/* need to init context */
-	req_ctx_init(rctx, areq);
-
-	rctx->flag |= HASH_DONE | HASH_FINAL;
-
-	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
-	/* flag HASH_FINAL and 0 data size */
-	sha1_mb_add_list(rctx, cstate);
-	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
-								HASH_LAST);
-	kernel_fpu_end();
-
-	/* check if anything is returned */
-	if (!sha_ctx)
-		return -EINPROGRESS;
-
-	if (sha_ctx->error) {
-		ret = sha_ctx->error;
-		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-		goto done;
-	}
-
-	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-	ret = sha_finish_walk(&rctx, cstate, false);
-	if (!rctx)
-		return -EINPROGRESS;
-done:
-	sha_complete_job(rctx, cstate, ret);
-	return ret;
-}
-
-static int sha1_mb_export(struct ahash_request *areq, void *out)
-{
-	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
-
-	memcpy(out, sctx, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha1_mb_import(struct ahash_request *areq, const void *in)
-{
-	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
-{
-	struct mcryptd_ahash *mcryptd_tfm;
-	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct mcryptd_hash_ctx *mctx;
-
-	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
-						CRYPTO_ALG_INTERNAL,
-						CRYPTO_ALG_INTERNAL);
-	if (IS_ERR(mcryptd_tfm))
-		return PTR_ERR(mcryptd_tfm);
-	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
-	mctx->alg_state = &sha1_mb_alg_state;
-	ctx->mcryptd_tfm = mcryptd_tfm;
-	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
-				sizeof(struct ahash_request) +
-				crypto_ahash_reqsize(&mcryptd_tfm->base));
-
-	return 0;
-}
-
-static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
-{
-	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm)
-{
-	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
-				sizeof(struct ahash_request) +
-				sizeof(struct sha1_hash_ctx));
-
-	return 0;
-}
-
-static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm)
-{
-	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static struct ahash_alg sha1_mb_areq_alg = {
-	.init		=	sha1_mb_init,
-	.update		=	sha1_mb_update,
-	.final		=	sha1_mb_final,
-	.finup		=	sha1_mb_finup,
-	.export		=	sha1_mb_export,
-	.import		=	sha1_mb_import,
-	.halg		=	{
-		.digestsize	=	SHA1_DIGEST_SIZE,
-		.statesize	=	sizeof(struct sha1_hash_ctx),
-		.base		=	{
-			.cra_name	 = "__sha1-mb",
-			.cra_driver_name = "__intel_sha1-mb",
-			.cra_priority	 = 100,
-			/*
-			 * use ASYNC flag as some buffers in multi-buffer
-			 * algo may not have completed before hashing thread
-			 * sleep
-			 */
-			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
-						CRYPTO_ALG_ASYNC |
-						CRYPTO_ALG_INTERNAL,
-			.cra_blocksize	= SHA1_BLOCK_SIZE,
-			.cra_module	= THIS_MODULE,
-			.cra_list	= LIST_HEAD_INIT
-					(sha1_mb_areq_alg.halg.base.cra_list),
-			.cra_init	= sha1_mb_areq_init_tfm,
-			.cra_exit	= sha1_mb_areq_exit_tfm,
-			.cra_ctxsize	= sizeof(struct sha1_hash_ctx),
-		}
-	}
-};
-
-static int sha1_mb_async_init(struct ahash_request *req)
-{
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
-	memcpy(mcryptd_req, req, sizeof(*req));
-	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
-	return crypto_ahash_init(mcryptd_req);
-}
-
-static int sha1_mb_async_update(struct ahash_request *req)
-{
-	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
-	memcpy(mcryptd_req, req, sizeof(*req));
-	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
-	return crypto_ahash_update(mcryptd_req);
-}
-
-static int sha1_mb_async_finup(struct ahash_request *req)
-{
-	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
-	memcpy(mcryptd_req, req, sizeof(*req));
-	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
-	return crypto_ahash_finup(mcryptd_req);
-}
-
-static int sha1_mb_async_final(struct ahash_request *req)
-{
-	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
-	memcpy(mcryptd_req, req, sizeof(*req));
-	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
-	return crypto_ahash_final(mcryptd_req);
-}
-
-static int sha1_mb_async_digest(struct ahash_request *req)
-{
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
-	memcpy(mcryptd_req, req, sizeof(*req));
-	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
-	return crypto_ahash_digest(mcryptd_req);
-}
-
-static int sha1_mb_async_export(struct ahash_request *req, void *out)
-{
-	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
-	memcpy(mcryptd_req, req, sizeof(*req));
-	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
-	return crypto_ahash_export(mcryptd_req, out);
-}
-
-static int sha1_mb_async_import(struct ahash_request *req, const void *in)
-{
-	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
-	struct mcryptd_hash_request_ctx *rctx;
-	struct ahash_request *areq;
-
-	memcpy(mcryptd_req, req, sizeof(*req));
-	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
-	rctx = ahash_request_ctx(mcryptd_req);
-	areq = &rctx->areq;
-
-	ahash_request_set_tfm(areq, child);
-	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
-					rctx->complete, req);
-
-	return crypto_ahash_import(mcryptd_req, in);
-}
-
-static struct ahash_alg sha1_mb_async_alg = {
-	.init           = sha1_mb_async_init,
-	.update         = sha1_mb_async_update,
-	.final          = sha1_mb_async_final,
-	.finup          = sha1_mb_async_finup,
-	.digest         = sha1_mb_async_digest,
-	.export		= sha1_mb_async_export,
-	.import		= sha1_mb_async_import,
-	.halg = {
-		.digestsize     = SHA1_DIGEST_SIZE,
-		.statesize	= sizeof(struct sha1_hash_ctx),
-		.base = {
-			.cra_name               = "sha1",
-			.cra_driver_name        = "sha1_mb",
-			.cra_priority           = 200,
-			.cra_flags              = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
-			.cra_blocksize          = SHA1_BLOCK_SIZE,
-			.cra_type               = &crypto_ahash_type,
-			.cra_module             = THIS_MODULE,
-			.cra_list               = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list),
-			.cra_init               = sha1_mb_async_init_tfm,
-			.cra_exit               = sha1_mb_async_exit_tfm,
-			.cra_ctxsize		= sizeof(struct sha1_mb_ctx),
-			.cra_alignmask		= 0,
-		},
-	},
-};
-
-static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
-{
-	struct mcryptd_hash_request_ctx *rctx;
-	unsigned long cur_time;
-	unsigned long next_flush = 0;
-	struct sha1_hash_ctx *sha_ctx;
-
-
-	cur_time = jiffies;
-
-	while (!list_empty(&cstate->work_list)) {
-		rctx = list_entry(cstate->work_list.next,
-				struct mcryptd_hash_request_ctx, waiter);
-		if (time_before(cur_time, rctx->tag.expire))
-			break;
-		kernel_fpu_begin();
-		sha_ctx = (struct sha1_hash_ctx *)
-					sha1_ctx_mgr_flush(cstate->mgr);
-		kernel_fpu_end();
-		if (!sha_ctx) {
-			pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
-			break;
-		}
-		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
-		sha_finish_walk(&rctx, cstate, true);
-		sha_complete_job(rctx, cstate, 0);
-	}
-
-	if (!list_empty(&cstate->work_list)) {
-		rctx = list_entry(cstate->work_list.next,
-				struct mcryptd_hash_request_ctx, waiter);
-		/* get the hash context and then flush time */
-		next_flush = rctx->tag.expire;
-		mcryptd_arm_flusher(cstate, get_delay(next_flush));
-	}
-	return next_flush;
-}
-
-static int __init sha1_mb_mod_init(void)
-{
-
-	int cpu;
-	int err;
-	struct mcryptd_alg_cstate *cpu_state;
-
-	/* check for dependent cpu features */
-	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
-	    !boot_cpu_has(X86_FEATURE_BMI2))
-		return -ENODEV;
-
-	/* initialize multibuffer structures */
-	sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate);
-
-	sha1_job_mgr_init = sha1_mb_mgr_init_avx2;
-	sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2;
-	sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2;
-	sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2;
-
-	if (!sha1_mb_alg_state.alg_cstate)
-		return -ENOMEM;
-	for_each_possible_cpu(cpu) {
-		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
-		cpu_state->next_flush = 0;
-		cpu_state->next_seq_num = 0;
-		cpu_state->flusher_engaged = false;
-		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
-		cpu_state->cpu = cpu;
-		cpu_state->alg_state = &sha1_mb_alg_state;
-		cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr),
-					GFP_KERNEL);
-		if (!cpu_state->mgr)
-			goto err2;
-		sha1_ctx_mgr_init(cpu_state->mgr);
-		INIT_LIST_HEAD(&cpu_state->work_list);
-		spin_lock_init(&cpu_state->work_lock);
-	}
-	sha1_mb_alg_state.flusher = &sha1_mb_flusher;
-
-	err = crypto_register_ahash(&sha1_mb_areq_alg);
-	if (err)
-		goto err2;
-	err = crypto_register_ahash(&sha1_mb_async_alg);
-	if (err)
-		goto err1;
-
-
-	return 0;
-err1:
-	crypto_unregister_ahash(&sha1_mb_areq_alg);
-err2:
-	for_each_possible_cpu(cpu) {
-		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
-		kfree(cpu_state->mgr);
-	}
-	free_percpu(sha1_mb_alg_state.alg_cstate);
-	return -ENODEV;
-}
-
-static void __exit sha1_mb_mod_fini(void)
-{
-	int cpu;
-	struct mcryptd_alg_cstate *cpu_state;
-
-	crypto_unregister_ahash(&sha1_mb_async_alg);
-	crypto_unregister_ahash(&sha1_mb_areq_alg);
-	for_each_possible_cpu(cpu) {
-		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
-		kfree(cpu_state->mgr);
-	}
-	free_percpu(sha1_mb_alg_state.alg_cstate);
-}
-
-module_init(sha1_mb_mod_init);
-module_exit(sha1_mb_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated");
-
-MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S
deleted file mode 100644
index 86688c6e7a25..000000000000
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Header file for multi buffer SHA1 algorithm data structure
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *      James Guilford <james.guilford@intel.com>
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-# Macros for defining data structures
-
-# Usage example
-
-#START_FIELDS	# JOB_AES
-###	name		size	align
-#FIELD	_plaintext,	8,	8	# pointer to plaintext
-#FIELD	_ciphertext,	8,	8	# pointer to ciphertext
-#FIELD	_IV,		16,	8	# IV
-#FIELD	_keys,		8,	8	# pointer to keys
-#FIELD	_len,		4,	4	# length in bytes
-#FIELD	_status,	4,	4	# status enumeration
-#FIELD	_user_data,	8,	8	# pointer to user data
-#UNION  _union,         size1,  align1, \
-#	                size2,  align2, \
-#	                size3,  align3, \
-#	                ...
-#END_FIELDS
-#%assign _JOB_AES_size	_FIELD_OFFSET
-#%assign _JOB_AES_align	_STRUCT_ALIGN
-
-#########################################################################
-
-# Alternate "struc-like" syntax:
-#	STRUCT job_aes2
-#	RES_Q	.plaintext,	1
-#	RES_Q	.ciphertext,	1
-#	RES_DQ	.IV,		1
-#	RES_B	.nested,	_JOB_AES_SIZE, _JOB_AES_ALIGN
-#	RES_U	.union,		size1, align1, \
-#				size2, align2, \
-#				...
-#	ENDSTRUCT
-#	# Following only needed if nesting
-#	%assign job_aes2_size	_FIELD_OFFSET
-#	%assign job_aes2_align	_STRUCT_ALIGN
-#
-# RES_* macros take a name, a count and an optional alignment.
-# The count in in terms of the base size of the macro, and the
-# default alignment is the base size.
-# The macros are:
-# Macro    Base size
-# RES_B	    1
-# RES_W	    2
-# RES_D     4
-# RES_Q     8
-# RES_DQ   16
-# RES_Y    32
-# RES_Z    64
-#
-# RES_U defines a union. It's arguments are a name and two or more
-# pairs of "size, alignment"
-#
-# The two assigns are only needed if this structure is being nested
-# within another. Even if the assigns are not done, one can still use
-# STRUCT_NAME_size as the size of the structure.
-#
-# Note that for nesting, you still need to assign to STRUCT_NAME_size.
-#
-# The differences between this and using "struc" directly are that each
-# type is implicitly aligned to its natural length (although this can be
-# over-ridden with an explicit third parameter), and that the structure
-# is padded at the end to its overall alignment.
-#
-
-#########################################################################
-
-#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_
-#define _SHA1_MB_MGR_DATASTRUCT_ASM_
-
-## START_FIELDS
-.macro START_FIELDS
- _FIELD_OFFSET = 0
- _STRUCT_ALIGN = 0
-.endm
-
-## FIELD name size align
-.macro FIELD name size align
- _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
- \name	= _FIELD_OFFSET
- _FIELD_OFFSET = _FIELD_OFFSET + (\size)
-.if (\align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = \align
-.endif
-.endm
-
-## END_FIELDS
-.macro END_FIELDS
- _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
-.endm
-
-########################################################################
-
-.macro STRUCT p1
-START_FIELDS
-.struc \p1
-.endm
-
-.macro ENDSTRUCT
- tmp = _FIELD_OFFSET
- END_FIELDS
- tmp = (_FIELD_OFFSET - %%tmp)
-.if (tmp > 0)
-	.lcomm	tmp
-.endif
-.endstruc
-.endm
-
-## RES_int name size align
-.macro RES_int p1 p2 p3
- name = \p1
- size = \p2
- align = .\p3
-
- _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
-.align align
-.lcomm name size
- _FIELD_OFFSET = _FIELD_OFFSET + (size)
-.if (align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = align
-.endif
-.endm
-
-
-
-# macro RES_B name, size [, align]
-.macro RES_B _name, _size, _align=1
-RES_int _name _size _align
-.endm
-
-# macro RES_W name, size [, align]
-.macro RES_W _name, _size, _align=2
-RES_int _name 2*(_size) _align
-.endm
-
-# macro RES_D name, size [, align]
-.macro RES_D _name, _size, _align=4
-RES_int _name 4*(_size) _align
-.endm
-
-# macro RES_Q name, size [, align]
-.macro RES_Q _name, _size, _align=8
-RES_int _name 8*(_size) _align
-.endm
-
-# macro RES_DQ name, size [, align]
-.macro RES_DQ _name, _size, _align=16
-RES_int _name 16*(_size) _align
-.endm
-
-# macro RES_Y name, size [, align]
-.macro RES_Y _name, _size, _align=32
-RES_int _name 32*(_size) _align
-.endm
-
-# macro RES_Z name, size [, align]
-.macro RES_Z _name, _size, _align=64
-RES_int _name 64*(_size) _align
-.endm
-
-
-#endif
-
-########################################################################
-#### Define constants
-########################################################################
-
-########################################################################
-#### Define SHA1 Out Of Order Data Structures
-########################################################################
-
-START_FIELDS    # LANE_DATA
-###     name            size    align
-FIELD   _job_in_lane,   8,      8       # pointer to job object
-END_FIELDS
-
-_LANE_DATA_size = _FIELD_OFFSET
-_LANE_DATA_align = _STRUCT_ALIGN
-
-########################################################################
-
-START_FIELDS    # SHA1_ARGS_X8
-###     name            size    align
-FIELD   _digest,        4*5*8,  16      # transposed digest
-FIELD   _data_ptr,      8*8,    8       # array of pointers to data
-END_FIELDS
-
-_SHA1_ARGS_X4_size =     _FIELD_OFFSET
-_SHA1_ARGS_X4_align =    _STRUCT_ALIGN
-_SHA1_ARGS_X8_size =     _FIELD_OFFSET
-_SHA1_ARGS_X8_align =    _STRUCT_ALIGN
-
-########################################################################
-
-START_FIELDS    # MB_MGR
-###     name            size    align
-FIELD   _args,          _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
-FIELD   _lens,          4*8,    8
-FIELD   _unused_lanes,  8,      8
-FIELD   _ldata,         _LANE_DATA_size*8, _LANE_DATA_align
-END_FIELDS
-
-_MB_MGR_size =   _FIELD_OFFSET
-_MB_MGR_align =  _STRUCT_ALIGN
-
-_args_digest    =     _args + _digest
-_args_data_ptr  =     _args + _data_ptr
-
-
-########################################################################
-#### Define constants
-########################################################################
-
-#define STS_UNKNOWN             0
-#define STS_BEING_PROCESSED     1
-#define STS_COMPLETED           2
-
-########################################################################
-#### Define JOB_SHA1 structure
-########################################################################
-
-START_FIELDS    # JOB_SHA1
-
-###     name                            size    align
-FIELD   _buffer,                        8,      8       # pointer to buffer
-FIELD   _len,                           4,      4       # length in bytes
-FIELD   _result_digest,                 5*4,    32      # Digest (output)
-FIELD   _status,                        4,      4
-FIELD   _user_data,                     8,      8
-END_FIELDS
-
-_JOB_SHA1_size =  _FIELD_OFFSET
-_JOB_SHA1_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
deleted file mode 100644
index 96df6a39d7e2..000000000000
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Flush routine for SHA1 multibuffer
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *      James Guilford <james.guilford@intel.com>
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha1_mb_mgr_datastruct.S"
-
-
-.extern sha1_x8_avx2
-
-# LINUX register definitions
-#define arg1    %rdi
-#define arg2    %rsi
-
-# Common definitions
-#define state   arg1
-#define job     arg2
-#define len2    arg2
-
-# idx must be a register not clobbered by sha1_x8_avx2
-#define idx		%r8
-#define DWORD_idx	%r8d
-
-#define unused_lanes    %rbx
-#define lane_data       %rbx
-#define tmp2            %rbx
-#define tmp2_w		%ebx
-
-#define job_rax         %rax
-#define tmp1            %rax
-#define size_offset     %rax
-#define tmp             %rax
-#define start_offset    %rax
-
-#define tmp3            %arg1
-
-#define extra_blocks    %arg2
-#define p               %arg2
-
-.macro LABEL prefix n
-\prefix\n\():
-.endm
-
-.macro JNE_SKIP i
-jne     skip_\i
-.endm
-
-.altmacro
-.macro SET_OFFSET _offset
-offset = \_offset
-.endm
-.noaltmacro
-
-# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
-# arg 1 : rcx : state
-ENTRY(sha1_mb_mgr_flush_avx2)
-	FRAME_BEGIN
-	push	%rbx
-
-	# If bit (32+3) is set, then all lanes are empty
-	mov     _unused_lanes(state), unused_lanes
-	bt      $32+3, unused_lanes
-	jc      return_null
-
-	# find a lane with a non-null job
-	xor     idx, idx
-	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-	cmovne  one(%rip), idx
-	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-	cmovne  two(%rip), idx
-	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-	cmovne  three(%rip), idx
-	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-	cmovne  four(%rip), idx
-	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-	cmovne  five(%rip), idx
-	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-	cmovne  six(%rip), idx
-	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-	cmovne  seven(%rip), idx
-
-	# copy idx to empty lanes
-copy_lane_data:
-	offset =  (_args + _data_ptr)
-	mov     offset(state,idx,8), tmp
-
-	I = 0
-.rep 8
-	offset =  (_ldata + I * _LANE_DATA_size + _job_in_lane)
-	cmpq    $0, offset(state)
-.altmacro
-	JNE_SKIP %I
-	offset =  (_args + _data_ptr + 8*I)
-	mov     tmp, offset(state)
-	offset =  (_lens + 4*I)
-	movl    $0xFFFFFFFF, offset(state)
-LABEL skip_ %I
-	I = (I+1)
-.noaltmacro
-.endr
-
-	# Find min length
-	vmovdqa _lens+0*16(state), %xmm0
-	vmovdqa _lens+1*16(state), %xmm1
-
-	vpminud %xmm1, %xmm0, %xmm2     # xmm2 has {D,C,B,A}
-	vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
-	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
-	vpalignr $4, %xmm2, %xmm3, %xmm3    # xmm3 has {x,x,x,E}
-	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
-
-	vmovd   %xmm2, DWORD_idx
-	mov	idx, len2
-	and	$0xF, idx
-	shr	$4, len2
-	jz	len_is_0
-
-	vpand   clear_low_nibble(%rip), %xmm2, %xmm2
-	vpshufd $0, %xmm2, %xmm2
-
-	vpsubd  %xmm2, %xmm0, %xmm0
-	vpsubd  %xmm2, %xmm1, %xmm1
-
-	vmovdqa %xmm0, _lens+0*16(state)
-	vmovdqa %xmm1, _lens+1*16(state)
-
-	# "state" and "args" are the same address, arg1
-	# len is arg2
-	call	sha1_x8_avx2
-	# state and idx are intact
-
-
-len_is_0:
-	# process completed job "idx"
-	imul    $_LANE_DATA_size, idx, lane_data
-	lea     _ldata(state, lane_data), lane_data
-
-	mov     _job_in_lane(lane_data), job_rax
-	movq    $0, _job_in_lane(lane_data)
-	movl    $STS_COMPLETED, _status(job_rax)
-	mov     _unused_lanes(state), unused_lanes
-	shl     $4, unused_lanes
-	or      idx, unused_lanes
-	mov     unused_lanes, _unused_lanes(state)
-
-	movl	$0xFFFFFFFF, _lens(state, idx, 4)
-
-	vmovd    _args_digest(state , idx, 4) , %xmm0
-	vpinsrd  $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
-	vpinsrd  $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
-	vpinsrd  $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
-	movl    _args_digest+4*32(state, idx, 4), tmp2_w
-
-	vmovdqu  %xmm0, _result_digest(job_rax)
-	offset =  (_result_digest + 1*16)
-	mov     tmp2_w, offset(job_rax)
-
-return:
-	pop	%rbx
-	FRAME_END
-	ret
-
-return_null:
-	xor     job_rax, job_rax
-	jmp     return
-ENDPROC(sha1_mb_mgr_flush_avx2)
-
-
-#################################################################
-
-.align 16
-ENTRY(sha1_mb_mgr_get_comp_job_avx2)
-	push    %rbx
-
-	## if bit 32+3 is set, then all lanes are empty
-	mov     _unused_lanes(state), unused_lanes
-	bt      $(32+3), unused_lanes
-	jc      .return_null
-
-	# Find min length
-	vmovdqa _lens(state), %xmm0
-	vmovdqa _lens+1*16(state), %xmm1
-
-	vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
-	vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
-	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
-	vpalignr $4, %xmm2, %xmm3, %xmm3    # xmm3 has {x,x,x,E}
-	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
-
-	vmovd   %xmm2, DWORD_idx
-	test    $~0xF, idx
-	jnz     .return_null
-
-	# process completed job "idx"
-	imul    $_LANE_DATA_size, idx, lane_data
-	lea     _ldata(state, lane_data), lane_data
-
-	mov     _job_in_lane(lane_data), job_rax
-	movq    $0,  _job_in_lane(lane_data)
-	movl    $STS_COMPLETED, _status(job_rax)
-	mov     _unused_lanes(state), unused_lanes
-	shl     $4, unused_lanes
-	or      idx, unused_lanes
-	mov     unused_lanes, _unused_lanes(state)
-
-	movl    $0xFFFFFFFF, _lens(state,  idx, 4)
-
-	vmovd   _args_digest(state, idx, 4), %xmm0
-	vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
-	vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
-	vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
-	movl    _args_digest+4*32(state, idx, 4), tmp2_w
-
-	vmovdqu %xmm0, _result_digest(job_rax)
-	movl    tmp2_w, _result_digest+1*16(job_rax)
-
-	pop     %rbx
-
-	ret
-
-.return_null:
-	xor     job_rax, job_rax
-	pop     %rbx
-	ret
-ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
-
-.data
-
-.align 16
-clear_low_nibble:
-.octa	0x000000000000000000000000FFFFFFF0
-one:
-.quad  1
-two:
-.quad  2
-three:
-.quad  3
-four:
-.quad  4
-five:
-.quad  5
-six:
-.quad  6
-seven:
-.quad  7
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
deleted file mode 100644
index 822acb5b464c..000000000000
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Initialization code for multi buffer SHA1 algorithm for AVX2
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "sha_mb_mgr.h"
-
-void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
-{
-	unsigned int j;
-	state->unused_lanes = 0xF76543210ULL;
-	for (j = 0; j < 8; j++) {
-		state->lens[j] = 0xFFFFFFFF;
-		state->ldata[j].job_in_lane = NULL;
-	}
-}
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
deleted file mode 100644
index 63a0d9c8e31f..000000000000
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Buffer submit code for multi buffer SHA1 algorithm
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *      James Guilford <james.guilford@intel.com>
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha1_mb_mgr_datastruct.S"
-
-
-.extern sha1_x8_avx
-
-# LINUX register definitions
-arg1    = %rdi
-arg2    = %rsi
-size_offset	= %rcx
-tmp2		= %rcx
-extra_blocks	= %rdx
-
-# Common definitions
-#define state   arg1
-#define job     %rsi
-#define len2    arg2
-#define p2      arg2
-
-# idx must be a register not clobberred by sha1_x8_avx2
-idx		= %r8
-DWORD_idx	= %r8d
-last_len	= %r8
-
-p               = %r11
-start_offset    = %r11
-
-unused_lanes    = %rbx
-BYTE_unused_lanes = %bl
-
-job_rax         = %rax
-len             = %rax
-DWORD_len	= %eax
-
-lane            = %r12
-tmp3            = %r12
-
-tmp             = %r9
-DWORD_tmp	= %r9d
-
-lane_data       = %r10
-
-# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
-# arg 1 : rcx : state
-# arg 2 : rdx : job
-ENTRY(sha1_mb_mgr_submit_avx2)
-	FRAME_BEGIN
-	push	%rbx
-	push	%r12
-
-	mov     _unused_lanes(state), unused_lanes
-	mov	unused_lanes, lane
-	and	$0xF, lane
-	shr     $4, unused_lanes
-	imul    $_LANE_DATA_size, lane, lane_data
-	movl    $STS_BEING_PROCESSED, _status(job)
-	lea     _ldata(state, lane_data), lane_data
-	mov     unused_lanes, _unused_lanes(state)
-	movl    _len(job),  DWORD_len
-
-	mov	job, _job_in_lane(lane_data)
-	shl	$4, len
-	or	lane, len
-
-	movl    DWORD_len,  _lens(state , lane, 4)
-
-	# Load digest words from result_digest
-	vmovdqu	_result_digest(job), %xmm0
-	mov	_result_digest+1*16(job), DWORD_tmp
-	vmovd    %xmm0, _args_digest(state, lane, 4)
-	vpextrd  $1, %xmm0, _args_digest+1*32(state , lane, 4)
-	vpextrd  $2, %xmm0, _args_digest+2*32(state , lane, 4)
-	vpextrd  $3, %xmm0, _args_digest+3*32(state , lane, 4)
-	movl    DWORD_tmp, _args_digest+4*32(state , lane, 4)
-
-	mov     _buffer(job), p
-	mov     p, _args_data_ptr(state, lane, 8)
-
-	cmp     $0xF, unused_lanes
-	jne     return_null
-
-start_loop:
-	# Find min length
-	vmovdqa _lens(state), %xmm0
-	vmovdqa _lens+1*16(state), %xmm1
-
-	vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
-	vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
-	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
-	vpalignr $4, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,x,E}
-	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
-
-	vmovd   %xmm2, DWORD_idx
-	mov    idx, len2
-	and    $0xF, idx
-	shr    $4, len2
-	jz     len_is_0
-
-	vpand   clear_low_nibble(%rip), %xmm2, %xmm2
-	vpshufd $0, %xmm2, %xmm2
-
-	vpsubd  %xmm2, %xmm0, %xmm0
-	vpsubd  %xmm2, %xmm1, %xmm1
-
-	vmovdqa %xmm0, _lens + 0*16(state)
-	vmovdqa %xmm1, _lens + 1*16(state)
-
-
-	# "state" and "args" are the same address, arg1
-	# len is arg2
-	call    sha1_x8_avx2
-
-	# state and idx are intact
-
-len_is_0:
-	# process completed job "idx"
-	imul    $_LANE_DATA_size, idx, lane_data
-	lea     _ldata(state, lane_data), lane_data
-
-	mov     _job_in_lane(lane_data), job_rax
-	mov     _unused_lanes(state), unused_lanes
-	movq    $0, _job_in_lane(lane_data)
-	movl    $STS_COMPLETED, _status(job_rax)
-	shl     $4, unused_lanes
-	or      idx, unused_lanes
-	mov     unused_lanes, _unused_lanes(state)
-
-	movl	$0xFFFFFFFF, _lens(state, idx, 4)
-
-	vmovd    _args_digest(state, idx, 4), %xmm0
-	vpinsrd  $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
-	vpinsrd  $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
-	vpinsrd  $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
-	movl     _args_digest+4*32(state, idx, 4), DWORD_tmp
-
-	vmovdqu  %xmm0, _result_digest(job_rax)
-	movl    DWORD_tmp, _result_digest+1*16(job_rax)
-
-return:
-	pop	%r12
-	pop	%rbx
-	FRAME_END
-	ret
-
-return_null:
-	xor     job_rax, job_rax
-	jmp     return
-
-ENDPROC(sha1_mb_mgr_submit_avx2)
-
-.data
-
-.align 16
-clear_low_nibble:
-	.octa	0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
deleted file mode 100644
index c9dae1cd2919..000000000000
--- a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * Multi-buffer SHA1 algorithm hash compute routine
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *      James Guilford <james.guilford@intel.com>
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include "sha1_mb_mgr_datastruct.S"
-
-## code to compute oct SHA1 using SSE-256
-## outer calling routine takes care of save and restore of XMM registers
-
-## Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15# ymm0-15
-##
-## Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
-## Linux preserves:                       rdi rbp r8
-##
-## clobbers ymm0-15
-
-
-# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-# "transpose" data in {r0...r7} using temps {t0...t1}
-# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
-# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
-# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
-# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
-# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
-# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
-# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
-# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
-# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
-#
-# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
-# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
-# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
-# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
-# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
-# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
-# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
-# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
-# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
-#
-
-.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
-	# process top half (r0..r3) {a...d}
-	vshufps  $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
-	vshufps  $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
-	vshufps  $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
-	vshufps  $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
-	vshufps  $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
-	vshufps  $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
-	vshufps  $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
-	vshufps  $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}
-
-	# use r2 in place of t0
-	# process bottom half (r4..r7) {e...h}
-	vshufps  $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
-	vshufps  $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
-	vshufps  $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
-	vshufps  $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
-	vshufps  $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
-	vshufps  $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
-	vshufps  $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
-	vshufps  $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}
-
-	vperm2f128      $0x13, \r1, \r5, \r6  # h6...a6
-	vperm2f128      $0x02, \r1, \r5, \r2  # h2...a2
-	vperm2f128      $0x13, \r3, \r7, \r5  # h5...a5
-	vperm2f128      $0x02, \r3, \r7, \r1  # h1...a1
-	vperm2f128      $0x13, \r0, \r4, \r7  # h7...a7
-	vperm2f128      $0x02, \r0, \r4, \r3  # h3...a3
-	vperm2f128      $0x13, \t0, \t1, \r4  # h4...a4
-	vperm2f128      $0x02, \t0, \t1, \r0  # h0...a0
-
-.endm
-##
-## Magic functions defined in FIPS 180-1
-##
-# macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D)))
-.macro MAGIC_F0 regF regB regC regD regT
-    vpxor \regD, \regC, \regF
-    vpand \regB, \regF, \regF
-    vpxor \regD, \regF, \regF
-.endm
-
-# macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D)
-.macro MAGIC_F1 regF regB regC regD regT
-    vpxor  \regC, \regD, \regF
-    vpxor  \regB, \regF, \regF
-.endm
-
-# macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D))
-.macro MAGIC_F2 regF regB regC regD regT
-    vpor  \regC, \regB, \regF
-    vpand \regC, \regB, \regT
-    vpand \regD, \regF, \regF
-    vpor  \regT, \regF, \regF
-.endm
-
-# macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D)
-.macro MAGIC_F3 regF regB regC regD regT
-    MAGIC_F1 \regF,\regB,\regC,\regD,\regT
-.endm
-
-# PROLD reg, imm, tmp
-.macro PROLD reg imm tmp
-	vpsrld  $(32-\imm), \reg, \tmp
-	vpslld  $\imm, \reg, \reg
-	vpor    \tmp, \reg, \reg
-.endm
-
-.macro PROLD_nd reg imm tmp src
-	vpsrld  $(32-\imm), \src, \tmp
-	vpslld  $\imm, \src, \reg
-	vpor	\tmp, \reg, \reg
-.endm
-
-.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
-	vpaddd	\immCNT, \regE, \regE
-	vpaddd	\memW*32(%rsp), \regE, \regE
-	PROLD_nd \regT, 5, \regF, \regA
-	vpaddd	\regT, \regE, \regE
-	\MAGIC  \regF, \regB, \regC, \regD, \regT
-        PROLD   \regB, 30, \regT
-        vpaddd  \regF, \regE, \regE
-.endm
-
-.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
-	vpaddd	\immCNT, \regE, \regE
-	offset = ((\memW - 14) & 15) * 32
-	vmovdqu offset(%rsp), W14
-	vpxor	W14, W16, W16
-	offset = ((\memW -  8) & 15) * 32
-	vpxor	offset(%rsp), W16, W16
-	offset = ((\memW -  3) & 15) * 32
-	vpxor	offset(%rsp), W16, W16
-	vpsrld	$(32-1), W16, \regF
-	vpslld	$1, W16, W16
-	vpor	W16, \regF, \regF
-
-	ROTATE_W
-
-	offset = ((\memW - 0) & 15) * 32
-	vmovdqu	\regF, offset(%rsp)
-	vpaddd	\regF, \regE, \regE
-	PROLD_nd \regT, 5, \regF, \regA
-	vpaddd	\regT, \regE, \regE
-	\MAGIC \regF,\regB,\regC,\regD,\regT      ## FUN  = MAGIC_Fi(B,C,D)
-	PROLD   \regB,30, \regT
-	vpaddd  \regF, \regE, \regE
-.endm
-
-########################################################################
-########################################################################
-########################################################################
-
-## FRAMESZ plus pushes must be an odd multiple of 8
-YMM_SAVE = (15-15)*32
-FRAMESZ = 32*16 + YMM_SAVE
-_YMM  =   FRAMESZ - YMM_SAVE
-
-#define VMOVPS   vmovups
-
-IDX  = %rax
-inp0 = %r9
-inp1 = %r10
-inp2 = %r11
-inp3 = %r12
-inp4 = %r13
-inp5 = %r14
-inp6 = %r15
-inp7 = %rcx
-arg1 = %rdi
-arg2 = %rsi
-RSP_SAVE = %rdx
-
-# ymm0 A
-# ymm1 B
-# ymm2 C
-# ymm3 D
-# ymm4 E
-# ymm5         F       AA
-# ymm6         T0      BB
-# ymm7         T1      CC
-# ymm8         T2      DD
-# ymm9         T3      EE
-# ymm10                T4      TMP
-# ymm11                T5      FUN
-# ymm12                T6      K
-# ymm13                T7      W14
-# ymm14                T8      W15
-# ymm15                T9      W16
-
-
-A  =     %ymm0
-B  =     %ymm1
-C  =     %ymm2
-D  =     %ymm3
-E  =     %ymm4
-F  =     %ymm5
-T0 =	 %ymm6
-T1 =     %ymm7
-T2 =     %ymm8
-T3 =     %ymm9
-T4 =     %ymm10
-T5 =     %ymm11
-T6 =     %ymm12
-T7 =     %ymm13
-T8  =     %ymm14
-T9  =     %ymm15
-
-AA  =     %ymm5
-BB  =     %ymm6
-CC  =     %ymm7
-DD  =     %ymm8
-EE  =     %ymm9
-TMP =     %ymm10
-FUN =     %ymm11
-K   =     %ymm12
-W14 =     %ymm13
-W15 =     %ymm14
-W16 =     %ymm15
-
-.macro ROTATE_ARGS
- TMP_ = E
- E = D
- D = C
- C = B
- B = A
- A = TMP_
-.endm
-
-.macro ROTATE_W
-TMP_  = W16
-W16  = W15
-W15  = W14
-W14  = TMP_
-.endm
-
-# 8 streams x 5 32bit words per digest x 4 bytes per word
-#define DIGEST_SIZE (8*5*4)
-
-.align 32
-
-# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
-# arg 1 : pointer to array[4] of pointer to input data
-# arg 2 : size (in blocks) ;; assumed to be >= 1
-#
-ENTRY(sha1_x8_avx2)
-
-	# save callee-saved clobbered registers to comply with C function ABI
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-
-	#save rsp
-	mov	%rsp, RSP_SAVE
-	sub     $FRAMESZ, %rsp
-
-	#align rsp to 32 Bytes
-	and	$~0x1F, %rsp
-
-	## Initialize digests
-	vmovdqu  0*32(arg1), A
-	vmovdqu  1*32(arg1), B
-	vmovdqu  2*32(arg1), C
-	vmovdqu  3*32(arg1), D
-	vmovdqu  4*32(arg1), E
-
-	## transpose input onto stack
-	mov     _data_ptr+0*8(arg1),inp0
-	mov     _data_ptr+1*8(arg1),inp1
-	mov     _data_ptr+2*8(arg1),inp2
-	mov     _data_ptr+3*8(arg1),inp3
-	mov     _data_ptr+4*8(arg1),inp4
-	mov     _data_ptr+5*8(arg1),inp5
-	mov     _data_ptr+6*8(arg1),inp6
-	mov     _data_ptr+7*8(arg1),inp7
-
-	xor     IDX, IDX
-lloop:
-	vmovdqu  PSHUFFLE_BYTE_FLIP_MASK(%rip), F
-	I=0
-.rep 2
-	VMOVPS   (inp0, IDX), T0
-	VMOVPS   (inp1, IDX), T1
-	VMOVPS   (inp2, IDX), T2
-	VMOVPS   (inp3, IDX), T3
-	VMOVPS   (inp4, IDX), T4
-	VMOVPS   (inp5, IDX), T5
-	VMOVPS   (inp6, IDX), T6
-	VMOVPS   (inp7, IDX), T7
-
-	TRANSPOSE8       T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
-	vpshufb  F, T0, T0
-	vmovdqu  T0, (I*8)*32(%rsp)
-	vpshufb  F, T1, T1
-	vmovdqu  T1, (I*8+1)*32(%rsp)
-	vpshufb  F, T2, T2
-	vmovdqu  T2, (I*8+2)*32(%rsp)
-	vpshufb  F, T3, T3
-	vmovdqu  T3, (I*8+3)*32(%rsp)
-	vpshufb  F, T4, T4
-	vmovdqu  T4, (I*8+4)*32(%rsp)
-	vpshufb  F, T5, T5
-	vmovdqu  T5, (I*8+5)*32(%rsp)
-	vpshufb  F, T6, T6
-	vmovdqu  T6, (I*8+6)*32(%rsp)
-	vpshufb  F, T7, T7
-	vmovdqu  T7, (I*8+7)*32(%rsp)
-	add     $32, IDX
-	I = (I+1)
-.endr
-	# save old digests
-	vmovdqu  A,AA
-	vmovdqu  B,BB
-	vmovdqu  C,CC
-	vmovdqu  D,DD
-	vmovdqu  E,EE
-
-##
-## perform 0-79 steps
-##
-	vmovdqu  K00_19(%rip), K
-## do rounds 0...15
-	I = 0
-.rep 16
-	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
-	ROTATE_ARGS
-	I = (I+1)
-.endr
-
-## do rounds 16...19
-	vmovdqu  ((16 - 16) & 15) * 32 (%rsp), W16
-	vmovdqu  ((16 - 15) & 15) * 32 (%rsp), W15
-.rep 4
-	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
-	ROTATE_ARGS
-	I = (I+1)
-.endr
-
-## do rounds 20...39
-	vmovdqu  K20_39(%rip), K
-.rep 20
-	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
-	ROTATE_ARGS
-	I = (I+1)
-.endr
-
-## do rounds 40...59
-	vmovdqu  K40_59(%rip), K
-.rep 20
-	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
-	ROTATE_ARGS
-	I = (I+1)
-.endr
-
-## do rounds 60...79
-	vmovdqu  K60_79(%rip), K
-.rep 20
-	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
-	ROTATE_ARGS
-	I = (I+1)
-.endr
-
-	vpaddd   AA,A,A
-	vpaddd   BB,B,B
-	vpaddd   CC,C,C
-	vpaddd   DD,D,D
-	vpaddd   EE,E,E
-
-	sub     $1, arg2
-	jne     lloop
-
-	# write out digests
-	vmovdqu  A, 0*32(arg1)
-	vmovdqu  B, 1*32(arg1)
-	vmovdqu  C, 2*32(arg1)
-	vmovdqu  D, 3*32(arg1)
-	vmovdqu  E, 4*32(arg1)
-
-	# update input pointers
-	add     IDX, inp0
-	add     IDX, inp1
-	add     IDX, inp2
-	add     IDX, inp3
-	add     IDX, inp4
-	add     IDX, inp5
-	add     IDX, inp6
-	add     IDX, inp7
-	mov     inp0, _data_ptr (arg1)
-	mov     inp1, _data_ptr + 1*8(arg1)
-	mov     inp2, _data_ptr + 2*8(arg1)
-	mov     inp3, _data_ptr + 3*8(arg1)
-	mov     inp4, _data_ptr + 4*8(arg1)
-	mov     inp5, _data_ptr + 5*8(arg1)
-	mov     inp6, _data_ptr + 6*8(arg1)
-	mov     inp7, _data_ptr + 7*8(arg1)
-
-	################
-	## Postamble
-
-	mov     RSP_SAVE, %rsp
-
-	# restore callee-saved clobbered registers
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-
-	ret
-ENDPROC(sha1_x8_avx2)
-
-
-.data
-
-.align 32
-K00_19:
-.octa 0x5A8279995A8279995A8279995A827999
-.octa 0x5A8279995A8279995A8279995A827999
-K20_39:
-.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
-.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
-K40_59:
-.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
-.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
-K60_79:
-.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
-.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
-PSHUFFLE_BYTE_FLIP_MASK:
-.octa 0x0c0d0e0f08090a0b0405060700010203
-.octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h b/arch/x86/crypto/sha-mb/sha_mb_ctx.h
deleted file mode 100644
index e36069d0c1bd..000000000000
--- a/arch/x86/crypto/sha-mb/sha_mb_ctx.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Header file for multi buffer SHA context
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _SHA_MB_CTX_INTERNAL_H
-#define _SHA_MB_CTX_INTERNAL_H
-
-#include "sha_mb_mgr.h"
-
-#define HASH_UPDATE          0x00
-#define HASH_FIRST           0x01
-#define HASH_LAST            0x02
-#define HASH_ENTIRE          0x03
-#define HASH_DONE	     0x04
-#define HASH_FINAL	     0x08
-
-#define HASH_CTX_STS_IDLE       0x00
-#define HASH_CTX_STS_PROCESSING 0x01
-#define HASH_CTX_STS_LAST       0x02
-#define HASH_CTX_STS_COMPLETE   0x04
-
-enum hash_ctx_error {
-	HASH_CTX_ERROR_NONE               =  0,
-	HASH_CTX_ERROR_INVALID_FLAGS      = -1,
-	HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
-	HASH_CTX_ERROR_ALREADY_COMPLETED  = -3,
-
-#ifdef HASH_CTX_DEBUG
-	HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
-#endif
-};
-
-
-#define hash_ctx_user_data(ctx)  ((ctx)->user_data)
-#define hash_ctx_digest(ctx)     ((ctx)->job.result_digest)
-#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
-#define hash_ctx_complete(ctx)   ((ctx)->status == HASH_CTX_STS_COMPLETE)
-#define hash_ctx_status(ctx)     ((ctx)->status)
-#define hash_ctx_error(ctx)      ((ctx)->error)
-#define hash_ctx_init(ctx) \
-	do { \
-		(ctx)->error = HASH_CTX_ERROR_NONE; \
-		(ctx)->status = HASH_CTX_STS_COMPLETE; \
-	} while (0)
-
-
-/* Hash Constants and Typedefs */
-#define SHA1_DIGEST_LENGTH          5
-#define SHA1_LOG2_BLOCK_SIZE        6
-
-#define SHA1_PADLENGTHFIELD_SIZE    8
-
-#ifdef SHA_MB_DEBUG
-#define assert(expr) \
-do { \
-	if (unlikely(!(expr))) { \
-		printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
-		#expr, __FILE__, __func__, __LINE__); \
-	} \
-} while (0)
-#else
-#define assert(expr) do {} while (0)
-#endif
-
-struct sha1_ctx_mgr {
-	struct sha1_mb_mgr mgr;
-};
-
-/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */
-
-struct sha1_hash_ctx {
-	/* Must be at struct offset 0 */
-	struct job_sha1       job;
-	/* status flag */
-	int status;
-	/* error flag */
-	int error;
-
-	uint32_t	total_length;
-	const void	*incoming_buffer;
-	uint32_t	incoming_buffer_length;
-	uint8_t		partial_block_buffer[SHA1_BLOCK_SIZE * 2];
-	uint32_t	partial_block_buffer_length;
-	void		*user_data;
-};
-
-#endif
diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h b/arch/x86/crypto/sha-mb/sha_mb_mgr.h
deleted file mode 100644
index 08ad1a9acfd7..000000000000
--- a/arch/x86/crypto/sha-mb/sha_mb_mgr.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Header file for multi buffer SHA1 algorithm manager
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of version 2 of the GNU General Public License as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  Contact Information:
- *      James Guilford <james.guilford@intel.com>
- *	Tim Chen <tim.c.chen@linux.intel.com>
- *
- *  BSD LICENSE
- *
- *  Copyright(c) 2014 Intel Corporation.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in
- *      the documentation and/or other materials provided with the
- *      distribution.
- *    * Neither the name of Intel Corporation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#ifndef __SHA_MB_MGR_H
-#define __SHA_MB_MGR_H
-
-
-#include <linux/types.h>
-
-#define NUM_SHA1_DIGEST_WORDS 5
-
-enum job_sts {	STS_UNKNOWN = 0,
-		STS_BEING_PROCESSED = 1,
-		STS_COMPLETED = 2,
-		STS_INTERNAL_ERROR = 3,
-		STS_ERROR = 4
-};
-
-struct job_sha1 {
-	u8	*buffer;
-	u32	len;
-	u32	result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
-	enum	job_sts status;
-	void	*user_data;
-};
-
-/* SHA1 out-of-order scheduler */
-
-/* typedef uint32_t sha1_digest_array[5][8]; */
-
-struct sha1_args_x8 {
-	uint32_t	digest[5][8];
-	uint8_t		*data_ptr[8];
-};
-
-struct sha1_lane_data {
-	struct job_sha1 *job_in_lane;
-};
-
-struct sha1_mb_mgr {
-	struct sha1_args_x8 args;
-
-	uint32_t lens[8];
-
-	/* each byte is index (0...7) of unused lanes */
-	uint64_t unused_lanes;
-	/* byte 4 is set to FF as a flag */
-	struct sha1_lane_data ldata[8];
-};
-
-
-#define SHA1_MB_MGR_NUM_LANES_AVX2 8
-
-void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state);
-struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state,
-					 struct job_sha1 *job);
-struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state);
-struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state);
-
-#endif
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
new file mode 100644
index 000000000000..2f8756375df5
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/Makefile
@@ -0,0 +1,11 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+                                $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o
+	sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \
+	     sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c
new file mode 100644
index 000000000000..561b2867b878
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb.c
@@ -0,0 +1,1028 @@
+/*
+ * Multi buffer SHA1 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha1_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha1_mb_alg_state;
+
+struct sha1_mb_ctx {
+	struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+		*cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
+{
+	struct ahash_request *areq;
+
+	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+	return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+				struct ahash_request *areq)
+{
+	rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)
+			(struct sha1_mb_mgr *state, struct job_sha1 *job);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)
+						(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)
+						(struct sha1_mb_mgr *state);
+
+static inline void sha1_init_digest(uint32_t *digest)
+{
+	static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0,
+					SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 };
+	memcpy(digest, initial_digest, sizeof(initial_digest));
+}
+
+static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
+			 uint32_t total_len)
+{
+	uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+	memset(&padblock[i], 0, SHA1_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	i += ((SHA1_BLOCK_SIZE - 1) &
+	      (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1)))
+	     + 1 + SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+	*((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+	/* Number of extra blocks to hash */
+	return i >> SHA1_LOG2_BLOCK_SIZE;
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr,
+						struct sha1_hash_ctx *ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			/* Clear PROCESSING bit */
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		/*
+		 * If the extra blocks are empty, begin hashing what remains
+		 * in the user's buffer.
+		 */
+		if (ctx->partial_block_buffer_length == 0 &&
+		    ctx->incoming_buffer_length) {
+
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			uint32_t copy_len;
+
+			/*
+			 * Only entire blocks can be hashed.
+			 * Copy remainder to extra blocks buffer.
+			 */
+			copy_len = len & (SHA1_BLOCK_SIZE-1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy(ctx->partial_block_buffer,
+				       ((const char *) buffer + len),
+				       copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			/* len should be a multiple of the block size now */
+			assert((len % SHA1_BLOCK_SIZE) == 0);
+
+			/* Set len to the number of blocks to be hashed */
+			len >>= SHA1_LOG2_BLOCK_SIZE;
+
+			if (len) {
+
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr,
+										&ctx->job);
+				continue;
+			}
+		}
+
+		/*
+		 * If the extra blocks are not empty, then we are
+		 * either on the last block(s) or we need more
+		 * user input before continuing.
+		 */
+		if (ctx->status & HASH_CTX_STS_LAST) {
+
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks =
+					sha1_pad(buf, ctx->total_length);
+
+			ctx->status = (HASH_CTX_STS_PROCESSING |
+				       HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (struct sha1_hash_ctx *)
+				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static struct sha1_hash_ctx
+			*sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
+{
+	/*
+	 * If get_comp_job returns NULL, there are no jobs complete.
+	 * If get_comp_job returns a job, verify that it is safe to return to
+	 * the user.
+	 * If it is not ready, resubmit the job to finish processing.
+	 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr
+	 * still need processing.
+	 */
+	struct sha1_hash_ctx *ctx;
+
+	ctx = (struct sha1_hash_ctx *) sha1_job_mgr_get_comp_job(&mgr->mgr);
+	return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr)
+{
+	sha1_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
+					  struct sha1_hash_ctx *ctx,
+					  const void *buffer,
+					  uint32_t len,
+					  int flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		/*
+		 * User should not pass anything other than FIRST, UPDATE, or
+		 * LAST
+		 */
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		/* Cannot submit to a currently processing job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		/* Cannot update a finished job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+
+	if (flags & HASH_FIRST) {
+		/* Init digest */
+		sha1_init_digest(ctx->job.result_digest);
+
+		/* Reset byte counter */
+		ctx->total_length = 0;
+
+		/* Clear extra blocks */
+		ctx->partial_block_buffer_length = 0;
+	}
+
+	/*
+	 * If we made it here, there were no errors during this call to
+	 * submit
+	 */
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	/* Store buffer ptr info from user */
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	/*
+	 * Store the user's request flags and mark this ctx as currently
+	 * being processed.
+	 */
+	ctx->status = (flags & HASH_LAST) ?
+			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+			HASH_CTX_STS_PROCESSING;
+
+	/* Advance byte counter */
+	ctx->total_length += len;
+
+	/*
+	 * If there is anything currently buffered in the extra blocks,
+	 * append to it until it contains a whole block.
+	 * Or if the user's buffer contains less than a whole block,
+	 * append as much as possible to the extra block.
+	 */
+	if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+		/*
+		 * Compute how many bytes to copy from user buffer into
+		 * extra block
+		 */
+		uint32_t copy_len = SHA1_BLOCK_SIZE -
+					ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			/* Copy and update relevant pointers and counters */
+			memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+				buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)
+					((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+
+		/*
+		 * The extra block should never contain more than 1 block
+		 * here
+		 */
+		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+		/*
+		 * If the extra block buffer contains exactly 1 block, it can
+		 * be hashed.
+		 */
+		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (struct sha1_hash_ctx *)
+				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
+{
+	struct sha1_hash_ctx *ctx;
+
+	while (1) {
+		ctx = (struct sha1_hash_ctx *) sha1_job_mgr_flush(&mgr->mgr);
+
+		/* If flush returned 0, there are no more jobs in flight. */
+		if (!ctx)
+			return NULL;
+
+		/*
+		 * If flush returned a job, resubmit the job to finish
+		 * processing.
+		 */
+		ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+		/*
+		 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be
+		 * returned. Otherwise, all jobs currently being managed by the
+		 * sha1_ctx_mgr still need processing. Loop.
+		 */
+		if (ctx)
+			return ctx;
+	}
+}
+
+static int sha1_mb_init(struct ahash_request *areq)
+{
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	hash_ctx_init(sctx);
+	sctx->job.result_digest[0] = SHA1_H0;
+	sctx->job.result_digest[1] = SHA1_H1;
+	sctx->job.result_digest[2] = SHA1_H2;
+	sctx->job.result_digest[3] = SHA1_H3;
+	sctx->job.result_digest[4] = SHA1_H4;
+	sctx->total_length = 0;
+	sctx->partial_block_buffer_length = 0;
+	sctx->status = HASH_CTX_STS_IDLE;
+
+	return 0;
+}
+
+static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+	int	i;
+	struct	sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+	__be32	*dst = (__be32 *) rctx->out;
+
+	for (i = 0; i < 5; ++i)
+		dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
+
+	return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+			struct mcryptd_alg_cstate *cstate, bool flush)
+{
+	int	flag = HASH_UPDATE;
+	int	nbytes, err = 0;
+	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+	struct sha1_hash_ctx *sha_ctx;
+
+	/* more work ? */
+	while (!(rctx->flag & HASH_DONE)) {
+		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+		if (nbytes < 0) {
+			err = nbytes;
+			goto out;
+		}
+		/* check if the walk is done */
+		if (crypto_ahash_walk_last(&rctx->walk)) {
+			rctx->flag |= HASH_DONE;
+			if (rctx->flag & HASH_FINAL)
+				flag |= HASH_LAST;
+
+		}
+		sha_ctx = (struct sha1_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
+		kernel_fpu_begin();
+		sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
+		if (!sha_ctx) {
+			if (flush)
+				sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
+		}
+		kernel_fpu_end();
+		if (sha_ctx)
+			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		else {
+			rctx = NULL;
+			goto out;
+		}
+	}
+
+	/* copy the results */
+	if (rctx->flag & HASH_FINAL)
+		sha1_mb_set_results(rctx);
+
+out:
+	*ret_rctx = rctx;
+	return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha1_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* remove from work list */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* check to see if there are other jobs that are done */
+	sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			if (irqs_disabled())
+				req_ctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				req_ctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+
+	return 0;
+}
+
+static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+			     struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long next_flush;
+	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* initialize tag */
+	rctx->tag.arrival = jiffies;    /* tag the arrival time */
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	next_flush = rctx->tag.arrival + delay;
+	rctx->tag.expire = next_flush;
+
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha1_mb_update(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha1_hash_ctx *sha_ctx;
+	int ret = 0, nbytes;
+
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk))
+		rctx->flag |= HASH_DONE;
+
+	/* submit */
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+	sha1_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha1_mb_finup(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha1_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+
+	/* submit */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+	sha1_mb_add_list(rctx, cstate);
+
+	kernel_fpu_begin();
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha1_mb_final(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+	struct sha1_hash_ctx *sha_ctx;
+	int ret = 0;
+	u8 data;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	rctx->flag |= HASH_DONE | HASH_FINAL;
+
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+	/* flag HASH_FINAL and 0 data size */
+	sha1_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+								HASH_LAST);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha1_mb_export(struct ahash_request *areq, void *out)
+{
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_mb_import(struct ahash_request *areq, const void *in)
+{
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct mcryptd_ahash *mcryptd_tfm;
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_hash_ctx *mctx;
+
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+						CRYPTO_ALG_INTERNAL,
+						CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(mcryptd_tfm))
+		return PTR_ERR(mcryptd_tfm);
+	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+	mctx->alg_state = &sha1_mb_alg_state;
+	ctx->mcryptd_tfm = mcryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+	return 0;
+}
+
+static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				sizeof(struct sha1_hash_ctx));
+
+	return 0;
+}
+
+static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha1_mb_areq_alg = {
+	.init		=	sha1_mb_init,
+	.update		=	sha1_mb_update,
+	.final		=	sha1_mb_final,
+	.finup		=	sha1_mb_finup,
+	.export		=	sha1_mb_export,
+	.import		=	sha1_mb_import,
+	.halg		=	{
+		.digestsize	=	SHA1_DIGEST_SIZE,
+		.statesize	=	sizeof(struct sha1_hash_ctx),
+		.base		=	{
+			.cra_name	 = "__sha1-mb",
+			.cra_driver_name = "__intel_sha1-mb",
+			.cra_priority	 = 100,
+			/*
+			 * use ASYNC flag as some buffers in multi-buffer
+			 * algo may not have completed before hashing thread
+			 * sleep
+			 */
+			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
+						CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_INTERNAL,
+			.cra_blocksize	= SHA1_BLOCK_SIZE,
+			.cra_module	= THIS_MODULE,
+			.cra_list	= LIST_HEAD_INIT
+					(sha1_mb_areq_alg.halg.base.cra_list),
+			.cra_init	= sha1_mb_areq_init_tfm,
+			.cra_exit	= sha1_mb_areq_exit_tfm,
+			.cra_ctxsize	= sizeof(struct sha1_hash_ctx),
+		}
+	}
+};
+
+static int sha1_mb_async_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha1_mb_async_update(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha1_mb_async_finup(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha1_mb_async_final(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha1_mb_async_digest(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha1_mb_async_export(struct ahash_request *req, void *out)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha1_mb_async_import(struct ahash_request *req, const void *in)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+	struct mcryptd_hash_request_ctx *rctx;
+	struct ahash_request *areq;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	rctx = ahash_request_ctx(mcryptd_req);
+	areq = &rctx->areq;
+
+	ahash_request_set_tfm(areq, child);
+	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+					rctx->complete, req);
+
+	return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha1_mb_async_alg = {
+	.init           = sha1_mb_async_init,
+	.update         = sha1_mb_async_update,
+	.final          = sha1_mb_async_final,
+	.finup          = sha1_mb_async_finup,
+	.digest         = sha1_mb_async_digest,
+	.export		= sha1_mb_async_export,
+	.import		= sha1_mb_async_import,
+	.halg = {
+		.digestsize     = SHA1_DIGEST_SIZE,
+		.statesize	= sizeof(struct sha1_hash_ctx),
+		.base = {
+			.cra_name               = "sha1",
+			.cra_driver_name        = "sha1_mb",
+			.cra_priority           = 200,
+			.cra_flags              = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+			.cra_blocksize          = SHA1_BLOCK_SIZE,
+			.cra_type               = &crypto_ahash_type,
+			.cra_module             = THIS_MODULE,
+			.cra_list               = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list),
+			.cra_init               = sha1_mb_async_init_tfm,
+			.cra_exit               = sha1_mb_async_exit_tfm,
+			.cra_ctxsize		= sizeof(struct sha1_mb_ctx),
+			.cra_alignmask		= 0,
+		},
+	},
+};
+
+static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+	struct mcryptd_hash_request_ctx *rctx;
+	unsigned long cur_time;
+	unsigned long next_flush = 0;
+	struct sha1_hash_ctx *sha_ctx;
+
+
+	cur_time = jiffies;
+
+	while (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		if (time_before(cur_time, rctx->tag.expire))
+			break;
+		kernel_fpu_begin();
+		sha_ctx = (struct sha1_hash_ctx *)
+					sha1_ctx_mgr_flush(cstate->mgr);
+		kernel_fpu_end();
+		if (!sha_ctx) {
+			pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
+			break;
+		}
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		sha_finish_walk(&rctx, cstate, true);
+		sha_complete_job(rctx, cstate, 0);
+	}
+
+	if (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/* get the hash context and then flush time */
+		next_flush = rctx->tag.expire;
+		mcryptd_arm_flusher(cstate, get_delay(next_flush));
+	}
+	return next_flush;
+}
+
+static int __init sha1_mb_mod_init(void)
+{
+
+	int cpu;
+	int err;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	/* check for dependent cpu features */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_BMI2))
+		return -ENODEV;
+
+	/* initialize multibuffer structures */
+	sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate);
+
+	sha1_job_mgr_init = sha1_mb_mgr_init_avx2;
+	sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2;
+	sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2;
+	sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2;
+
+	if (!sha1_mb_alg_state.alg_cstate)
+		return -ENOMEM;
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+		cpu_state->next_flush = 0;
+		cpu_state->next_seq_num = 0;
+		cpu_state->flusher_engaged = false;
+		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+		cpu_state->cpu = cpu;
+		cpu_state->alg_state = &sha1_mb_alg_state;
+		cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr),
+					GFP_KERNEL);
+		if (!cpu_state->mgr)
+			goto err2;
+		sha1_ctx_mgr_init(cpu_state->mgr);
+		INIT_LIST_HEAD(&cpu_state->work_list);
+		spin_lock_init(&cpu_state->work_lock);
+	}
+	sha1_mb_alg_state.flusher = &sha1_mb_flusher;
+
+	err = crypto_register_ahash(&sha1_mb_areq_alg);
+	if (err)
+		goto err2;
+	err = crypto_register_ahash(&sha1_mb_async_alg);
+	if (err)
+		goto err1;
+
+
+	return 0;
+err1:
+	crypto_unregister_ahash(&sha1_mb_areq_alg);
+err2:
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha1_mb_alg_state.alg_cstate);
+	return -ENODEV;
+}
+
+static void __exit sha1_mb_mod_fini(void)
+{
+	int cpu;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	crypto_unregister_ahash(&sha1_mb_async_alg);
+	crypto_unregister_ahash(&sha1_mb_areq_alg);
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha1_mb_alg_state.alg_cstate);
+}
+
+module_init(sha1_mb_mod_init);
+module_exit(sha1_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
new file mode 100644
index 000000000000..98a35bcc6f4a
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
@@ -0,0 +1,136 @@
+/*
+ * Header file for multi buffer SHA context
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha1_mb_mgr.h"
+
+#define HASH_UPDATE          0x00
+#define HASH_FIRST           0x01
+#define HASH_LAST            0x02
+#define HASH_ENTIRE          0x03
+#define HASH_DONE	     0x04
+#define HASH_FINAL	     0x08
+
+#define HASH_CTX_STS_IDLE       0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST       0x02
+#define HASH_CTX_STS_COMPLETE   0x04
+
+enum hash_ctx_error {
+	HASH_CTX_ERROR_NONE               =  0,
+	HASH_CTX_ERROR_INVALID_FLAGS      = -1,
+	HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+	HASH_CTX_ERROR_ALREADY_COMPLETED  = -3,
+
+#ifdef HASH_CTX_DEBUG
+	HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+
+#define hash_ctx_user_data(ctx)  ((ctx)->user_data)
+#define hash_ctx_digest(ctx)     ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx)   ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx)     ((ctx)->status)
+#define hash_ctx_error(ctx)      ((ctx)->error)
+#define hash_ctx_init(ctx) \
+	do { \
+		(ctx)->error = HASH_CTX_ERROR_NONE; \
+		(ctx)->status = HASH_CTX_STS_COMPLETE; \
+	} while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA1_DIGEST_LENGTH          5
+#define SHA1_LOG2_BLOCK_SIZE        6
+
+#define SHA1_PADLENGTHFIELD_SIZE    8
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+	if (unlikely(!(expr))) { \
+		printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+		#expr, __FILE__, __func__, __LINE__); \
+	} \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha1_ctx_mgr {
+	struct sha1_mb_mgr mgr;
+};
+
+/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */
+
+struct sha1_hash_ctx {
+	/* Must be at struct offset 0 */
+	struct job_sha1       job;
+	/* status flag */
+	int status;
+	/* error flag */
+	int error;
+
+	uint32_t	total_length;
+	const void	*incoming_buffer;
+	uint32_t	incoming_buffer_length;
+	uint8_t		partial_block_buffer[SHA1_BLOCK_SIZE * 2];
+	uint32_t	partial_block_buffer_length;
+	void		*user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
new file mode 100644
index 000000000000..08ad1a9acfd7
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
@@ -0,0 +1,110 @@
+/*
+ * Header file for multi buffer SHA1 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      James Guilford <james.guilford@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+
+#include <linux/types.h>
+
+#define NUM_SHA1_DIGEST_WORDS 5
+
+enum job_sts {	STS_UNKNOWN = 0,
+		STS_BEING_PROCESSED = 1,
+		STS_COMPLETED = 2,
+		STS_INTERNAL_ERROR = 3,
+		STS_ERROR = 4
+};
+
+struct job_sha1 {
+	u8	*buffer;
+	u32	len;
+	u32	result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
+	enum	job_sts status;
+	void	*user_data;
+};
+
+/* SHA1 out-of-order scheduler */
+
+/* typedef uint32_t sha1_digest_array[5][8]; */
+
+struct sha1_args_x8 {
+	uint32_t	digest[5][8];
+	uint8_t		*data_ptr[8];
+};
+
+struct sha1_lane_data {
+	struct job_sha1 *job_in_lane;
+};
+
+struct sha1_mb_mgr {
+	struct sha1_args_x8 args;
+
+	uint32_t lens[8];
+
+	/* each byte is index (0...7) of unused lanes */
+	uint64_t unused_lanes;
+	/* byte 4 is set to FF as a flag */
+	struct sha1_lane_data ldata[8];
+};
+
+
+#define SHA1_MB_MGR_NUM_LANES_AVX2 8
+
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state,
+					 struct job_sha1 *job);
+struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..86688c6e7a25
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
@@ -0,0 +1,287 @@
+/*
+ * Header file for multi buffer SHA1 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      James Guilford <james.guilford@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS	# JOB_AES
+###	name		size	align
+#FIELD	_plaintext,	8,	8	# pointer to plaintext
+#FIELD	_ciphertext,	8,	8	# pointer to ciphertext
+#FIELD	_IV,		16,	8	# IV
+#FIELD	_keys,		8,	8	# pointer to keys
+#FIELD	_len,		4,	4	# length in bytes
+#FIELD	_status,	4,	4	# status enumeration
+#FIELD	_user_data,	8,	8	# pointer to user data
+#UNION  _union,         size1,  align1, \
+#	                size2,  align2, \
+#	                size3,  align3, \
+#	                ...
+#END_FIELDS
+#%assign _JOB_AES_size	_FIELD_OFFSET
+#%assign _JOB_AES_align	_STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+#	STRUCT job_aes2
+#	RES_Q	.plaintext,	1
+#	RES_Q	.ciphertext,	1
+#	RES_DQ	.IV,		1
+#	RES_B	.nested,	_JOB_AES_SIZE, _JOB_AES_ALIGN
+#	RES_U	.union,		size1, align1, \
+#				size2, align2, \
+#				...
+#	ENDSTRUCT
+#	# Following only needed if nesting
+#	%assign job_aes2_size	_FIELD_OFFSET
+#	%assign job_aes2_align	_STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro    Base size
+# RES_B	    1
+# RES_W	    2
+# RES_D     4
+# RES_Q     8
+# RES_DQ   16
+# RES_Y    32
+# RES_Z    64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_
+#define _SHA1_MB_MGR_DATASTRUCT_ASM_
+
+## START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+## FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name	= _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+## END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+	.lcomm	tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+
+#endif
+
+########################################################################
+#### Define constants
+########################################################################
+
+########################################################################
+#### Define SHA1 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS    # LANE_DATA
+###     name            size    align
+FIELD   _job_in_lane,   8,      8       # pointer to job object
+END_FIELDS
+
+_LANE_DATA_size = _FIELD_OFFSET
+_LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS    # SHA1_ARGS_X8
+###     name            size    align
+FIELD   _digest,        4*5*8,  16      # transposed digest
+FIELD   _data_ptr,      8*8,    8       # array of pointers to data
+END_FIELDS
+
+_SHA1_ARGS_X4_size =     _FIELD_OFFSET
+_SHA1_ARGS_X4_align =    _STRUCT_ALIGN
+_SHA1_ARGS_X8_size =     _FIELD_OFFSET
+_SHA1_ARGS_X8_align =    _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS    # MB_MGR
+###     name            size    align
+FIELD   _args,          _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD   _lens,          4*8,    8
+FIELD   _unused_lanes,  8,      8
+FIELD   _ldata,         _LANE_DATA_size*8, _LANE_DATA_align
+END_FIELDS
+
+_MB_MGR_size =   _FIELD_OFFSET
+_MB_MGR_align =  _STRUCT_ALIGN
+
+_args_digest    =     _args + _digest
+_args_data_ptr  =     _args + _data_ptr
+
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN             0
+#define STS_BEING_PROCESSED     1
+#define STS_COMPLETED           2
+
+########################################################################
+#### Define JOB_SHA1 structure
+########################################################################
+
+START_FIELDS    # JOB_SHA1
+
+###     name                            size    align
+FIELD   _buffer,                        8,      8       # pointer to buffer
+FIELD   _len,                           4,      4       # length in bytes
+FIELD   _result_digest,                 5*4,    32      # Digest (output)
+FIELD   _status,                        4,      4
+FIELD   _user_data,                     8,      8
+END_FIELDS
+
+_JOB_SHA1_size =  _FIELD_OFFSET
+_JOB_SHA1_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..96df6a39d7e2
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
@@ -0,0 +1,302 @@
+/*
+ * Flush routine for SHA1 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      James Guilford <james.guilford@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx2
+
+# LINUX register definitions
+#define arg1    %rdi
+#define arg2    %rsi
+
+# Common definitions
+#define state   arg1
+#define job     arg2
+#define len2    arg2
+
+# idx must be a register not clobbered by sha1_x8_avx2
+#define idx		%r8
+#define DWORD_idx	%r8d
+
+#define unused_lanes    %rbx
+#define lane_data       %rbx
+#define tmp2            %rbx
+#define tmp2_w		%ebx
+
+#define job_rax         %rax
+#define tmp1            %rax
+#define size_offset     %rax
+#define tmp             %rax
+#define start_offset    %rax
+
+#define tmp3            %arg1
+
+#define extra_blocks    %arg2
+#define p               %arg2
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne     skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha1_mb_mgr_flush_avx2)
+	FRAME_BEGIN
+	push	%rbx
+
+	# If bit (32+3) is set, then all lanes are empty
+	mov     _unused_lanes(state), unused_lanes
+	bt      $32+3, unused_lanes
+	jc      return_null
+
+	# find a lane with a non-null job
+	xor     idx, idx
+	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+	cmovne  one(%rip), idx
+	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+	cmovne  two(%rip), idx
+	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+	cmovne  three(%rip), idx
+	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+	cmovne  four(%rip), idx
+	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+	cmovne  five(%rip), idx
+	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+	cmovne  six(%rip), idx
+	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+	cmovne  seven(%rip), idx
+
+	# copy idx to empty lanes
+copy_lane_data:
+	offset =  (_args + _data_ptr)
+	mov     offset(state,idx,8), tmp
+
+	I = 0
+.rep 8
+	offset =  (_ldata + I * _LANE_DATA_size + _job_in_lane)
+	cmpq    $0, offset(state)
+.altmacro
+	JNE_SKIP %I
+	offset =  (_args + _data_ptr + 8*I)
+	mov     tmp, offset(state)
+	offset =  (_lens + 4*I)
+	movl    $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+	I = (I+1)
+.noaltmacro
+.endr
+
+	# Find min length
+	vmovdqa _lens+0*16(state), %xmm0
+	vmovdqa _lens+1*16(state), %xmm1
+
+	vpminud %xmm1, %xmm0, %xmm2     # xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
+	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3    # xmm3 has {x,x,x,E}
+	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
+
+	vmovd   %xmm2, DWORD_idx
+	mov	idx, len2
+	and	$0xF, idx
+	shr	$4, len2
+	jz	len_is_0
+
+	vpand   clear_low_nibble(%rip), %xmm2, %xmm2
+	vpshufd $0, %xmm2, %xmm2
+
+	vpsubd  %xmm2, %xmm0, %xmm0
+	vpsubd  %xmm2, %xmm1, %xmm1
+
+	vmovdqa %xmm0, _lens+0*16(state)
+	vmovdqa %xmm1, _lens+1*16(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call	sha1_x8_avx2
+	# state and idx are intact
+
+
+len_is_0:
+	# process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+	lea     _ldata(state, lane_data), lane_data
+
+	mov     _job_in_lane(lane_data), job_rax
+	movq    $0, _job_in_lane(lane_data)
+	movl    $STS_COMPLETED, _status(job_rax)
+	mov     _unused_lanes(state), unused_lanes
+	shl     $4, unused_lanes
+	or      idx, unused_lanes
+	mov     unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state, idx, 4)
+
+	vmovd    _args_digest(state , idx, 4) , %xmm0
+	vpinsrd  $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd  $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd  $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	movl    _args_digest+4*32(state, idx, 4), tmp2_w
+
+	vmovdqu  %xmm0, _result_digest(job_rax)
+	offset =  (_result_digest + 1*16)
+	mov     tmp2_w, offset(job_rax)
+
+return:
+	pop	%rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor     job_rax, job_rax
+	jmp     return
+ENDPROC(sha1_mb_mgr_flush_avx2)
+
+
+#################################################################
+
+.align 16
+ENTRY(sha1_mb_mgr_get_comp_job_avx2)
+	push    %rbx
+
+	## if bit 32+3 is set, then all lanes are empty
+	mov     _unused_lanes(state), unused_lanes
+	bt      $(32+3), unused_lanes
+	jc      .return_null
+
+	# Find min length
+	vmovdqa _lens(state), %xmm0
+	vmovdqa _lens+1*16(state), %xmm1
+
+	vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
+	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3    # xmm3 has {x,x,x,E}
+	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
+
+	vmovd   %xmm2, DWORD_idx
+	test    $~0xF, idx
+	jnz     .return_null
+
+	# process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+	lea     _ldata(state, lane_data), lane_data
+
+	mov     _job_in_lane(lane_data), job_rax
+	movq    $0,  _job_in_lane(lane_data)
+	movl    $STS_COMPLETED, _status(job_rax)
+	mov     _unused_lanes(state), unused_lanes
+	shl     $4, unused_lanes
+	or      idx, unused_lanes
+	mov     unused_lanes, _unused_lanes(state)
+
+	movl    $0xFFFFFFFF, _lens(state,  idx, 4)
+
+	vmovd   _args_digest(state, idx, 4), %xmm0
+	vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	movl    _args_digest+4*32(state, idx, 4), tmp2_w
+
+	vmovdqu %xmm0, _result_digest(job_rax)
+	movl    tmp2_w, _result_digest+1*16(job_rax)
+
+	pop     %rbx
+
+	ret
+
+.return_null:
+	xor     job_rax, job_rax
+	pop     %rbx
+	ret
+ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+.octa	0x000000000000000000000000FFFFFFF0
+one:
+.quad  1
+two:
+.quad  2
+three:
+.quad  3
+four:
+.quad  4
+five:
+.quad  5
+six:
+.quad  6
+seven:
+.quad  7
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..d2add0d35f43
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Initialization code for multi buffer SHA1 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha1_mb_mgr.h"
+
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
+{
+	unsigned int j;
+	state->unused_lanes = 0xF76543210ULL;
+	for (j = 0; j < 8; j++) {
+		state->lens[j] = 0xFFFFFFFF;
+		state->ldata[j].job_in_lane = NULL;
+	}
+}
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..63a0d9c8e31f
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
@@ -0,0 +1,210 @@
+/*
+ * Buffer submit code for multi buffer SHA1 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      James Guilford <james.guilford@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx
+
+# LINUX register definitions
+arg1    = %rdi
+arg2    = %rsi
+size_offset	= %rcx
+tmp2		= %rcx
+extra_blocks	= %rdx
+
+# Common definitions
+#define state   arg1
+#define job     %rsi
+#define len2    arg2
+#define p2      arg2
+
+# idx must be a register not clobberred by sha1_x8_avx2
+idx		= %r8
+DWORD_idx	= %r8d
+last_len	= %r8
+
+p               = %r11
+start_offset    = %r11
+
+unused_lanes    = %rbx
+BYTE_unused_lanes = %bl
+
+job_rax         = %rax
+len             = %rax
+DWORD_len	= %eax
+
+lane            = %r12
+tmp3            = %r12
+
+tmp             = %r9
+DWORD_tmp	= %r9d
+
+lane_data       = %r10
+
+# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha1_mb_mgr_submit_avx2)
+	FRAME_BEGIN
+	push	%rbx
+	push	%r12
+
+	mov     _unused_lanes(state), unused_lanes
+	mov	unused_lanes, lane
+	and	$0xF, lane
+	shr     $4, unused_lanes
+	imul    $_LANE_DATA_size, lane, lane_data
+	movl    $STS_BEING_PROCESSED, _status(job)
+	lea     _ldata(state, lane_data), lane_data
+	mov     unused_lanes, _unused_lanes(state)
+	movl    _len(job),  DWORD_len
+
+	mov	job, _job_in_lane(lane_data)
+	shl	$4, len
+	or	lane, len
+
+	movl    DWORD_len,  _lens(state , lane, 4)
+
+	# Load digest words from result_digest
+	vmovdqu	_result_digest(job), %xmm0
+	mov	_result_digest+1*16(job), DWORD_tmp
+	vmovd    %xmm0, _args_digest(state, lane, 4)
+	vpextrd  $1, %xmm0, _args_digest+1*32(state , lane, 4)
+	vpextrd  $2, %xmm0, _args_digest+2*32(state , lane, 4)
+	vpextrd  $3, %xmm0, _args_digest+3*32(state , lane, 4)
+	movl    DWORD_tmp, _args_digest+4*32(state , lane, 4)
+
+	mov     _buffer(job), p
+	mov     p, _args_data_ptr(state, lane, 8)
+
+	cmp     $0xF, unused_lanes
+	jne     return_null
+
+start_loop:
+	# Find min length
+	vmovdqa _lens(state), %xmm0
+	vmovdqa _lens+1*16(state), %xmm1
+
+	vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
+	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,x,E}
+	vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
+
+	vmovd   %xmm2, DWORD_idx
+	mov    idx, len2
+	and    $0xF, idx
+	shr    $4, len2
+	jz     len_is_0
+
+	vpand   clear_low_nibble(%rip), %xmm2, %xmm2
+	vpshufd $0, %xmm2, %xmm2
+
+	vpsubd  %xmm2, %xmm0, %xmm0
+	vpsubd  %xmm2, %xmm1, %xmm1
+
+	vmovdqa %xmm0, _lens + 0*16(state)
+	vmovdqa %xmm1, _lens + 1*16(state)
+
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call    sha1_x8_avx2
+
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+	lea     _ldata(state, lane_data), lane_data
+
+	mov     _job_in_lane(lane_data), job_rax
+	mov     _unused_lanes(state), unused_lanes
+	movq    $0, _job_in_lane(lane_data)
+	movl    $STS_COMPLETED, _status(job_rax)
+	shl     $4, unused_lanes
+	or      idx, unused_lanes
+	mov     unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state, idx, 4)
+
+	vmovd    _args_digest(state, idx, 4), %xmm0
+	vpinsrd  $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd  $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd  $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+	movl     _args_digest+4*32(state, idx, 4), DWORD_tmp
+
+	vmovdqu  %xmm0, _result_digest(job_rax)
+	movl    DWORD_tmp, _result_digest+1*16(job_rax)
+
+return:
+	pop	%r12
+	pop	%rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor     job_rax, job_rax
+	jmp     return
+
+ENDPROC(sha1_mb_mgr_submit_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+	.octa	0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
new file mode 100644
index 000000000000..c9dae1cd2919
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
@@ -0,0 +1,481 @@
+/*
+ * Multi-buffer SHA1 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      James Guilford <james.guilford@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+## code to compute oct SHA1 using SSE-256
+## outer calling routine takes care of save and restore of XMM registers
+
+## Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15# ymm0-15
+##
+## Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
+## Linux preserves:                       rdi rbp r8
+##
+## clobbers ymm0-15
+
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# "transpose" data in {r0...r7} using temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+	# process top half (r0..r3) {a...d}
+	vshufps  $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
+	vshufps  $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
+	vshufps  $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
+	vshufps  $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
+	vshufps  $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
+	vshufps  $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
+	vshufps  $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
+	vshufps  $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}
+
+	# use r2 in place of t0
+	# process bottom half (r4..r7) {e...h}
+	vshufps  $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
+	vshufps  $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
+	vshufps  $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
+	vshufps  $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
+	vshufps  $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
+	vshufps  $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
+	vshufps  $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
+	vshufps  $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}
+
+	vperm2f128      $0x13, \r1, \r5, \r6  # h6...a6
+	vperm2f128      $0x02, \r1, \r5, \r2  # h2...a2
+	vperm2f128      $0x13, \r3, \r7, \r5  # h5...a5
+	vperm2f128      $0x02, \r3, \r7, \r1  # h1...a1
+	vperm2f128      $0x13, \r0, \r4, \r7  # h7...a7
+	vperm2f128      $0x02, \r0, \r4, \r3  # h3...a3
+	vperm2f128      $0x13, \t0, \t1, \r4  # h4...a4
+	vperm2f128      $0x02, \t0, \t1, \r0  # h0...a0
+
+.endm
+##
+## Magic functions defined in FIPS 180-1
+##
+# macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D)))
+.macro MAGIC_F0 regF regB regC regD regT
+    vpxor \regD, \regC, \regF
+    vpand \regB, \regF, \regF
+    vpxor \regD, \regF, \regF
+.endm
+
+# macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D)
+.macro MAGIC_F1 regF regB regC regD regT
+    vpxor  \regC, \regD, \regF
+    vpxor  \regB, \regF, \regF
+.endm
+
+# macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D))
+.macro MAGIC_F2 regF regB regC regD regT
+    vpor  \regC, \regB, \regF
+    vpand \regC, \regB, \regT
+    vpand \regD, \regF, \regF
+    vpor  \regT, \regF, \regF
+.endm
+
+# macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D)
+.macro MAGIC_F3 regF regB regC regD regT
+    MAGIC_F1 \regF,\regB,\regC,\regD,\regT
+.endm
+
+# PROLD reg, imm, tmp
+.macro PROLD reg imm tmp
+	vpsrld  $(32-\imm), \reg, \tmp
+	vpslld  $\imm, \reg, \reg
+	vpor    \tmp, \reg, \reg
+.endm
+
+.macro PROLD_nd reg imm tmp src
+	vpsrld  $(32-\imm), \src, \tmp
+	vpslld  $\imm, \src, \reg
+	vpor	\tmp, \reg, \reg
+.endm
+
+.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
+	vpaddd	\immCNT, \regE, \regE
+	vpaddd	\memW*32(%rsp), \regE, \regE
+	PROLD_nd \regT, 5, \regF, \regA
+	vpaddd	\regT, \regE, \regE
+	\MAGIC  \regF, \regB, \regC, \regD, \regT
+        PROLD   \regB, 30, \regT
+        vpaddd  \regF, \regE, \regE
+.endm
+
+.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
+	vpaddd	\immCNT, \regE, \regE
+	offset = ((\memW - 14) & 15) * 32
+	vmovdqu offset(%rsp), W14
+	vpxor	W14, W16, W16
+	offset = ((\memW -  8) & 15) * 32
+	vpxor	offset(%rsp), W16, W16
+	offset = ((\memW -  3) & 15) * 32
+	vpxor	offset(%rsp), W16, W16
+	vpsrld	$(32-1), W16, \regF
+	vpslld	$1, W16, W16
+	vpor	W16, \regF, \regF
+
+	ROTATE_W
+
+	offset = ((\memW - 0) & 15) * 32
+	vmovdqu	\regF, offset(%rsp)
+	vpaddd	\regF, \regE, \regE
+	PROLD_nd \regT, 5, \regF, \regA
+	vpaddd	\regT, \regE, \regE
+	\MAGIC \regF,\regB,\regC,\regD,\regT      ## FUN  = MAGIC_Fi(B,C,D)
+	PROLD   \regB,30, \regT
+	vpaddd  \regF, \regE, \regE
+.endm
+
+########################################################################
+########################################################################
+########################################################################
+
+## FRAMESZ plus pushes must be an odd multiple of 8
+YMM_SAVE = (15-15)*32
+FRAMESZ = 32*16 + YMM_SAVE
+_YMM  =   FRAMESZ - YMM_SAVE
+
+#define VMOVPS   vmovups
+
+IDX  = %rax
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = %rcx
+arg1 = %rdi
+arg2 = %rsi
+RSP_SAVE = %rdx
+
+# ymm0 A
+# ymm1 B
+# ymm2 C
+# ymm3 D
+# ymm4 E
+# ymm5         F       AA
+# ymm6         T0      BB
+# ymm7         T1      CC
+# ymm8         T2      DD
+# ymm9         T3      EE
+# ymm10                T4      TMP
+# ymm11                T5      FUN
+# ymm12                T6      K
+# ymm13                T7      W14
+# ymm14                T8      W15
+# ymm15                T9      W16
+
+
+A  =     %ymm0
+B  =     %ymm1
+C  =     %ymm2
+D  =     %ymm3
+E  =     %ymm4
+F  =     %ymm5
+T0 =	 %ymm6
+T1 =     %ymm7
+T2 =     %ymm8
+T3 =     %ymm9
+T4 =     %ymm10
+T5 =     %ymm11
+T6 =     %ymm12
+T7 =     %ymm13
+T8  =     %ymm14
+T9  =     %ymm15
+
+AA  =     %ymm5
+BB  =     %ymm6
+CC  =     %ymm7
+DD  =     %ymm8
+EE  =     %ymm9
+TMP =     %ymm10
+FUN =     %ymm11
+K   =     %ymm12
+W14 =     %ymm13
+W15 =     %ymm14
+W16 =     %ymm15
+
+.macro ROTATE_ARGS
+ TMP_ = E
+ E = D
+ D = C
+ C = B
+ B = A
+ A = TMP_
+.endm
+
+.macro ROTATE_W
+TMP_  = W16
+W16  = W15
+W15  = W14
+W14  = TMP_
+.endm
+
+# 8 streams x 5 32bit words per digest x 4 bytes per word
+#define DIGEST_SIZE (8*5*4)
+
+.align 32
+
+# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
+# arg 1 : pointer to array[4] of pointer to input data
+# arg 2 : size (in blocks) ;; assumed to be >= 1
+#
+ENTRY(sha1_x8_avx2)
+
+	# save callee-saved clobbered registers to comply with C function ABI
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	#save rsp
+	mov	%rsp, RSP_SAVE
+	sub     $FRAMESZ, %rsp
+
+	#align rsp to 32 Bytes
+	and	$~0x1F, %rsp
+
+	## Initialize digests
+	vmovdqu  0*32(arg1), A
+	vmovdqu  1*32(arg1), B
+	vmovdqu  2*32(arg1), C
+	vmovdqu  3*32(arg1), D
+	vmovdqu  4*32(arg1), E
+
+	## transpose input onto stack
+	mov     _data_ptr+0*8(arg1),inp0
+	mov     _data_ptr+1*8(arg1),inp1
+	mov     _data_ptr+2*8(arg1),inp2
+	mov     _data_ptr+3*8(arg1),inp3
+	mov     _data_ptr+4*8(arg1),inp4
+	mov     _data_ptr+5*8(arg1),inp5
+	mov     _data_ptr+6*8(arg1),inp6
+	mov     _data_ptr+7*8(arg1),inp7
+
+	xor     IDX, IDX
+lloop:
+	vmovdqu  PSHUFFLE_BYTE_FLIP_MASK(%rip), F
+	I=0
+.rep 2
+	VMOVPS   (inp0, IDX), T0
+	VMOVPS   (inp1, IDX), T1
+	VMOVPS   (inp2, IDX), T2
+	VMOVPS   (inp3, IDX), T3
+	VMOVPS   (inp4, IDX), T4
+	VMOVPS   (inp5, IDX), T5
+	VMOVPS   (inp6, IDX), T6
+	VMOVPS   (inp7, IDX), T7
+
+	TRANSPOSE8       T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+	vpshufb  F, T0, T0
+	vmovdqu  T0, (I*8)*32(%rsp)
+	vpshufb  F, T1, T1
+	vmovdqu  T1, (I*8+1)*32(%rsp)
+	vpshufb  F, T2, T2
+	vmovdqu  T2, (I*8+2)*32(%rsp)
+	vpshufb  F, T3, T3
+	vmovdqu  T3, (I*8+3)*32(%rsp)
+	vpshufb  F, T4, T4
+	vmovdqu  T4, (I*8+4)*32(%rsp)
+	vpshufb  F, T5, T5
+	vmovdqu  T5, (I*8+5)*32(%rsp)
+	vpshufb  F, T6, T6
+	vmovdqu  T6, (I*8+6)*32(%rsp)
+	vpshufb  F, T7, T7
+	vmovdqu  T7, (I*8+7)*32(%rsp)
+	add     $32, IDX
+	I = (I+1)
+.endr
+	# save old digests
+	vmovdqu  A,AA
+	vmovdqu  B,BB
+	vmovdqu  C,CC
+	vmovdqu  D,DD
+	vmovdqu  E,EE
+
+##
+## perform 0-79 steps
+##
+	vmovdqu  K00_19(%rip), K
+## do rounds 0...15
+	I = 0
+.rep 16
+	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+	ROTATE_ARGS
+	I = (I+1)
+.endr
+
+## do rounds 16...19
+	vmovdqu  ((16 - 16) & 15) * 32 (%rsp), W16
+	vmovdqu  ((16 - 15) & 15) * 32 (%rsp), W15
+.rep 4
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+	ROTATE_ARGS
+	I = (I+1)
+.endr
+
+## do rounds 20...39
+	vmovdqu  K20_39(%rip), K
+.rep 20
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+	ROTATE_ARGS
+	I = (I+1)
+.endr
+
+## do rounds 40...59
+	vmovdqu  K40_59(%rip), K
+.rep 20
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+	ROTATE_ARGS
+	I = (I+1)
+.endr
+
+## do rounds 60...79
+	vmovdqu  K60_79(%rip), K
+.rep 20
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+	ROTATE_ARGS
+	I = (I+1)
+.endr
+
+	vpaddd   AA,A,A
+	vpaddd   BB,B,B
+	vpaddd   CC,C,C
+	vpaddd   DD,D,D
+	vpaddd   EE,E,E
+
+	sub     $1, arg2
+	jne     lloop
+
+	# write out digests
+	vmovdqu  A, 0*32(arg1)
+	vmovdqu  B, 1*32(arg1)
+	vmovdqu  C, 2*32(arg1)
+	vmovdqu  D, 3*32(arg1)
+	vmovdqu  E, 4*32(arg1)
+
+	# update input pointers
+	add     IDX, inp0
+	add     IDX, inp1
+	add     IDX, inp2
+	add     IDX, inp3
+	add     IDX, inp4
+	add     IDX, inp5
+	add     IDX, inp6
+	add     IDX, inp7
+	mov     inp0, _data_ptr (arg1)
+	mov     inp1, _data_ptr + 1*8(arg1)
+	mov     inp2, _data_ptr + 2*8(arg1)
+	mov     inp3, _data_ptr + 3*8(arg1)
+	mov     inp4, _data_ptr + 4*8(arg1)
+	mov     inp5, _data_ptr + 5*8(arg1)
+	mov     inp6, _data_ptr + 6*8(arg1)
+	mov     inp7, _data_ptr + 7*8(arg1)
+
+	################
+	## Postamble
+
+	mov     RSP_SAVE, %rsp
+
+	# restore callee-saved clobbered registers
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+
+	ret
+ENDPROC(sha1_x8_avx2)
+
+
+.data
+
+.align 32
+K00_19:
+.octa 0x5A8279995A8279995A8279995A827999
+.octa 0x5A8279995A8279995A8279995A827999
+K20_39:
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+K40_59:
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+K60_79:
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203
-- 
cgit v1.2.3


From 8c603ff28659e65fdae960cd3f952ec168fc773a Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@linux.intel.com>
Date: Mon, 27 Jun 2016 10:20:04 -0700
Subject: crypto: sha512-mb - SHA512 multibuffer job manager and glue code

This patch introduces the multi-buffer job manager which is responsible
for submitting scatter-gather buffers from several SHA512 jobs to the
multi-buffer algorithm. It also contains the flush routine that's called
by the crypto daemon to complete the job when no new jobs arrive before
the deadline of maximum latency of a SHA512 crypto job.

The SHA512 multi-buffer crypto algorithm is defined and initialized in this
patch.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile              |    1 +
 arch/x86/crypto/sha512-mb/Makefile    |   11 +
 arch/x86/crypto/sha512-mb/sha512_mb.c | 1043 +++++++++++++++++++++++++++++++++
 3 files changed, 1055 insertions(+)
 create mode 100644 arch/x86/crypto/sha512-mb/Makefile
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_mb.c

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 52c868523921..34b3fa2889d1 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -51,6 +51,7 @@ ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
 	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
 	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
+	obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile
new file mode 100644
index 000000000000..0a57e2103980
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/Makefile
@@ -0,0 +1,11 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+                                $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb.o
+	sha512-mb-y := sha512_mb.o sha512_mb_mgr_flush_avx2.o \
+	     sha512_mb_mgr_init_avx2.o sha512_mb_mgr_submit_avx2.o sha512_x4_avx2.o
+endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
new file mode 100644
index 000000000000..676f0f272f2b
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -0,0 +1,1043 @@
+/*
+ * Multi buffer SHA512 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha512_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha512_mb_alg_state;
+
+struct sha512_mb_ctx {
+	struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+		*cast_hash_to_mcryptd_ctx(struct sha512_hash_ctx *hash_ctx)
+{
+	struct ahash_request *areq;
+
+	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+	return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+				struct ahash_request *areq)
+{
+	rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha512_job_mgr_init)(struct sha512_mb_mgr *state);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_submit)
+						(struct sha512_mb_mgr *state,
+						struct job_sha512 *job);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_flush)
+						(struct sha512_mb_mgr *state);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_get_comp_job)
+						(struct sha512_mb_mgr *state);
+
+inline void sha512_init_digest(uint64_t *digest)
+{
+	static const uint64_t initial_digest[SHA512_DIGEST_LENGTH] = {
+					SHA512_H0, SHA512_H1, SHA512_H2,
+					SHA512_H3, SHA512_H4, SHA512_H5,
+					SHA512_H6, SHA512_H7 };
+	memcpy(digest, initial_digest, sizeof(initial_digest));
+}
+
+inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2],
+			 uint32_t total_len)
+{
+	uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+	memset(&padblock[i], 0, SHA512_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	i += ((SHA512_BLOCK_SIZE - 1) &
+	      (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1)))
+	     + 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+	*((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+	/* Number of extra blocks to hash */
+	return i >> SHA512_LOG2_BLOCK_SIZE;
+}
+
+static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit
+		(struct sha512_ctx_mgr *mgr, struct sha512_hash_ctx *ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			/* Clear PROCESSING bit */
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		/*
+		 * If the extra blocks are empty, begin hashing what remains
+		 * in the user's buffer.
+		 */
+		if (ctx->partial_block_buffer_length == 0 &&
+		    ctx->incoming_buffer_length) {
+
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			uint32_t copy_len;
+
+			/*
+			 * Only entire blocks can be hashed.
+			 * Copy remainder to extra blocks buffer.
+			 */
+			copy_len = len & (SHA512_BLOCK_SIZE-1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy(ctx->partial_block_buffer,
+				       ((const char *) buffer + len),
+				       copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			/* len should be a multiple of the block size now */
+			assert((len % SHA512_BLOCK_SIZE) == 0);
+
+			/* Set len to the number of blocks to be hashed */
+			len >>= SHA512_LOG2_BLOCK_SIZE;
+
+			if (len) {
+
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (struct sha512_hash_ctx *)
+					sha512_job_mgr_submit(&mgr->mgr,
+					&ctx->job);
+				continue;
+			}
+		}
+
+		/*
+		 * If the extra blocks are not empty, then we are
+		 * either on the last block(s) or we need more
+		 * user input before continuing.
+		 */
+		if (ctx->status & HASH_CTX_STS_LAST) {
+
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks =
+					sha512_pad(buf, ctx->total_length);
+
+			ctx->status = (HASH_CTX_STS_PROCESSING |
+				       HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (struct sha512_hash_ctx *)
+				sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		if (ctx)
+			ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static struct sha512_hash_ctx
+		*sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr)
+{
+	/*
+	 * If get_comp_job returns NULL, there are no jobs complete.
+	 * If get_comp_job returns a job, verify that it is safe to return to
+	 * the user.
+	 * If it is not ready, resubmit the job to finish processing.
+	 * If sha512_ctx_mgr_resubmit returned a job, it is ready to be
+	 * returned.
+	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr
+	 * still need processing.
+	 */
+	struct sha512_hash_ctx *ctx;
+
+	ctx = (struct sha512_hash_ctx *)
+				sha512_job_mgr_get_comp_job(&mgr->mgr);
+	return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
+{
+	sha512_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha512_hash_ctx
+			*sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr,
+					  struct sha512_hash_ctx *ctx,
+					  const void *buffer,
+					  uint32_t len,
+					  int flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		/* User should not pass anything other than FIRST, UPDATE, or
+		 * LAST
+		 */
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		/* Cannot submit to a currently processing job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		/* Cannot update a finished job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+
+	if (flags & HASH_FIRST) {
+		/* Init digest */
+		sha512_init_digest(ctx->job.result_digest);
+
+		/* Reset byte counter */
+		ctx->total_length = 0;
+
+		/* Clear extra blocks */
+		ctx->partial_block_buffer_length = 0;
+	}
+
+	/* If we made it here, there were no errors during this call to
+	 * submit
+	 */
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	/* Store buffer ptr info from user */
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	/* Store the user's request flags and mark this ctx as currently being
+	 * processed.
+	 */
+	ctx->status = (flags & HASH_LAST) ?
+			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+			HASH_CTX_STS_PROCESSING;
+
+	/* Advance byte counter */
+	ctx->total_length += len;
+
+	/*
+	 * If there is anything currently buffered in the extra blocks,
+	 * append to it until it contains a whole block.
+	 * Or if the user's buffer contains less than a whole block,
+	 * append as much as possible to the extra block.
+	 */
+	if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+		/* Compute how many bytes to copy from user buffer into extra
+		 * block
+		 */
+		uint32_t copy_len = SHA512_BLOCK_SIZE -
+					ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			/* Copy and update relevant pointers and counters */
+			memcpy
+		(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+				buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)
+					((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+
+		/* The extra block should never contain more than 1 block
+		 * here
+		 */
+		assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+		/* If the extra block buffer contains exactly 1 block, it can
+		 * be hashed.
+		 */
+		if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (struct sha512_hash_ctx *)
+				sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
+{
+	struct sha512_hash_ctx *ctx;
+
+	while (1) {
+		ctx = (struct sha512_hash_ctx *)
+					sha512_job_mgr_flush(&mgr->mgr);
+
+		/* If flush returned 0, there are no more jobs in flight. */
+		if (!ctx)
+			return NULL;
+
+		/*
+		 * If flush returned a job, resubmit the job to finish
+		 * processing.
+		 */
+		ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+		/*
+		 * If sha512_ctx_mgr_resubmit returned a job, it is ready to
+		 * be returned. Otherwise, all jobs currently being managed by
+		 * the sha512_ctx_mgr still need processing. Loop.
+		 */
+		if (ctx)
+			return ctx;
+	}
+}
+
+static int sha512_mb_init(struct ahash_request *areq)
+{
+	struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	hash_ctx_init(sctx);
+	sctx->job.result_digest[0] = SHA512_H0;
+	sctx->job.result_digest[1] = SHA512_H1;
+	sctx->job.result_digest[2] = SHA512_H2;
+	sctx->job.result_digest[3] = SHA512_H3;
+	sctx->job.result_digest[4] = SHA512_H4;
+	sctx->job.result_digest[5] = SHA512_H5;
+	sctx->job.result_digest[6] = SHA512_H6;
+	sctx->job.result_digest[7] = SHA512_H7;
+	sctx->total_length = 0;
+	sctx->partial_block_buffer_length = 0;
+	sctx->status = HASH_CTX_STS_IDLE;
+
+	return 0;
+}
+
+static int sha512_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+	int	i;
+	struct	sha512_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+	__be64	*dst = (__be64 *) rctx->out;
+
+	for (i = 0; i < 8; ++i)
+		dst[i] = cpu_to_be64(sctx->job.result_digest[i]);
+
+	return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+			struct mcryptd_alg_cstate *cstate, bool flush)
+{
+	int	flag = HASH_UPDATE;
+	int	nbytes, err = 0;
+	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+	struct sha512_hash_ctx *sha_ctx;
+
+	/* more work ? */
+	while (!(rctx->flag & HASH_DONE)) {
+		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+		if (nbytes < 0) {
+			err = nbytes;
+			goto out;
+		}
+		/* check if the walk is done */
+		if (crypto_ahash_walk_last(&rctx->walk)) {
+			rctx->flag |= HASH_DONE;
+			if (rctx->flag & HASH_FINAL)
+				flag |= HASH_LAST;
+
+		}
+		sha_ctx = (struct sha512_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
+		kernel_fpu_begin();
+		sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
+		if (!sha_ctx) {
+			if (flush)
+				sha_ctx = sha512_ctx_mgr_flush(cstate->mgr);
+		}
+		kernel_fpu_end();
+		if (sha_ctx)
+			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		else {
+			rctx = NULL;
+			goto out;
+		}
+	}
+
+	/* copy the results */
+	if (rctx->flag & HASH_FINAL)
+		sha512_mb_set_results(rctx);
+
+out:
+	*ret_rctx = rctx;
+	return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* remove from work list */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* check to see if there are other jobs that are done */
+	sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			if (irqs_disabled())
+				rctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				rctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+
+	return 0;
+}
+
+static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+			     struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long next_flush;
+	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* initialize tag */
+	rctx->tag.arrival = jiffies;    /* tag the arrival time */
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	next_flush = rctx->tag.arrival + delay;
+	rctx->tag.expire = next_flush;
+
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha512_mb_update(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0, nbytes;
+
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk))
+		rctx->flag |= HASH_DONE;
+
+	/* submit */
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	sha512_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha512_mb_finup(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+
+	/* submit */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	sha512_mb_add_list(rctx, cstate);
+
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha512_mb_final(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0;
+	u8 data;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	rctx->flag |= HASH_DONE | HASH_FINAL;
+
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	/* flag HASH_FINAL and 0 data size */
+	sha512_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+								HASH_LAST);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha512_mb_export(struct ahash_request *areq, void *out)
+{
+	struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_mb_import(struct ahash_request *areq, const void *in)
+{
+	struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct mcryptd_ahash *mcryptd_tfm;
+	struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_hash_ctx *mctx;
+
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha512-mb",
+						CRYPTO_ALG_INTERNAL,
+						CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(mcryptd_tfm))
+		return PTR_ERR(mcryptd_tfm);
+	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+	mctx->alg_state = &sha512_mb_alg_state;
+	ctx->mcryptd_tfm = mcryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+	return 0;
+}
+
+static void sha512_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha512_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				sizeof(struct sha512_hash_ctx));
+
+	return 0;
+}
+
+static void sha512_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha512_mb_areq_alg = {
+	.init		=	sha512_mb_init,
+	.update		=	sha512_mb_update,
+	.final		=	sha512_mb_final,
+	.finup		=	sha512_mb_finup,
+	.export		=	sha512_mb_export,
+	.import		=	sha512_mb_import,
+	.halg		=	{
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.statesize	=	sizeof(struct sha512_hash_ctx),
+	.base		=	{
+			.cra_name	 = "__sha512-mb",
+			.cra_driver_name = "__intel_sha512-mb",
+			.cra_priority	 = 100,
+			/*
+			 * use ASYNC flag as some buffers in multi-buffer
+			 * algo may not have completed before hashing thread
+			 * sleep
+			 */
+			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
+						CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_INTERNAL,
+			.cra_blocksize	= SHA512_BLOCK_SIZE,
+			.cra_module	= THIS_MODULE,
+			.cra_list	= LIST_HEAD_INIT
+					(sha512_mb_areq_alg.halg.base.cra_list),
+			.cra_init	= sha512_mb_areq_init_tfm,
+			.cra_exit	= sha512_mb_areq_exit_tfm,
+			.cra_ctxsize	= sizeof(struct sha512_hash_ctx),
+		}
+	}
+};
+
+static int sha512_mb_async_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha512_mb_async_update(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha512_mb_async_finup(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha512_mb_async_final(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha512_mb_async_digest(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha512_mb_async_export(struct ahash_request *req, void *out)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha512_mb_async_import(struct ahash_request *req, const void *in)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+	struct mcryptd_hash_request_ctx *rctx;
+	struct ahash_request *areq;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	rctx = ahash_request_ctx(mcryptd_req);
+
+	areq = &rctx->areq;
+
+	ahash_request_set_tfm(areq, child);
+	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+					rctx->complete, req);
+
+	return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha512_mb_async_alg = {
+	.init           = sha512_mb_async_init,
+	.update         = sha512_mb_async_update,
+	.final          = sha512_mb_async_final,
+	.finup          = sha512_mb_async_finup,
+	.digest         = sha512_mb_async_digest,
+	.export		= sha512_mb_async_export,
+	.import		= sha512_mb_async_import,
+	.halg = {
+		.digestsize     = SHA512_DIGEST_SIZE,
+		.statesize      = sizeof(struct sha512_hash_ctx),
+		.base = {
+			.cra_name               = "sha512",
+			.cra_driver_name        = "sha512_mb",
+			.cra_priority           = 200,
+			.cra_flags              = CRYPTO_ALG_TYPE_AHASH |
+							CRYPTO_ALG_ASYNC,
+			.cra_blocksize          = SHA512_BLOCK_SIZE,
+			.cra_type               = &crypto_ahash_type,
+			.cra_module             = THIS_MODULE,
+			.cra_list               = LIST_HEAD_INIT
+				(sha512_mb_async_alg.halg.base.cra_list),
+			.cra_init               = sha512_mb_async_init_tfm,
+			.cra_exit               = sha512_mb_async_exit_tfm,
+			.cra_ctxsize		= sizeof(struct sha512_mb_ctx),
+			.cra_alignmask		= 0,
+		},
+	},
+};
+
+static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+	struct mcryptd_hash_request_ctx *rctx;
+	unsigned long cur_time;
+	unsigned long next_flush = 0;
+	struct sha512_hash_ctx *sha_ctx;
+
+
+	cur_time = jiffies;
+
+	while (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		if time_before(cur_time, rctx->tag.expire)
+			break;
+		kernel_fpu_begin();
+		sha_ctx = (struct sha512_hash_ctx *)
+					sha512_ctx_mgr_flush(cstate->mgr);
+		kernel_fpu_end();
+		if (!sha_ctx) {
+			pr_err("sha512_mb error: nothing got flushed for"
+							" non-empty list\n");
+			break;
+		}
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		sha_finish_walk(&rctx, cstate, true);
+		sha_complete_job(rctx, cstate, 0);
+	}
+
+	if (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/* get the hash context and then flush time */
+		next_flush = rctx->tag.expire;
+		mcryptd_arm_flusher(cstate, get_delay(next_flush));
+	}
+	return next_flush;
+}
+
+static int __init sha512_mb_mod_init(void)
+{
+
+	int cpu;
+	int err;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	/* check for dependent cpu features */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_BMI2))
+		return -ENODEV;
+
+	/* initialize multibuffer structures */
+	sha512_mb_alg_state.alg_cstate =
+				alloc_percpu(struct mcryptd_alg_cstate);
+
+	sha512_job_mgr_init = sha512_mb_mgr_init_avx2;
+	sha512_job_mgr_submit = sha512_mb_mgr_submit_avx2;
+	sha512_job_mgr_flush = sha512_mb_mgr_flush_avx2;
+	sha512_job_mgr_get_comp_job = sha512_mb_mgr_get_comp_job_avx2;
+
+	if (!sha512_mb_alg_state.alg_cstate)
+		return -ENOMEM;
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+		cpu_state->next_flush = 0;
+		cpu_state->next_seq_num = 0;
+		cpu_state->flusher_engaged = false;
+		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+		cpu_state->cpu = cpu;
+		cpu_state->alg_state = &sha512_mb_alg_state;
+		cpu_state->mgr = kzalloc(sizeof(struct sha512_ctx_mgr),
+								GFP_KERNEL);
+		if (!cpu_state->mgr)
+			goto err2;
+		sha512_ctx_mgr_init(cpu_state->mgr);
+		INIT_LIST_HEAD(&cpu_state->work_list);
+		spin_lock_init(&cpu_state->work_lock);
+	}
+	sha512_mb_alg_state.flusher = &sha512_mb_flusher;
+
+	err = crypto_register_ahash(&sha512_mb_areq_alg);
+	if (err)
+		goto err2;
+	err = crypto_register_ahash(&sha512_mb_async_alg);
+	if (err)
+		goto err1;
+
+
+	return 0;
+err1:
+	crypto_unregister_ahash(&sha512_mb_areq_alg);
+err2:
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha512_mb_alg_state.alg_cstate);
+	return -ENODEV;
+}
+
+static void __exit sha512_mb_mod_fini(void)
+{
+	int cpu;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	crypto_unregister_ahash(&sha512_mb_async_alg);
+	crypto_unregister_ahash(&sha512_mb_areq_alg);
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha512_mb_alg_state.alg_cstate);
+}
+
+module_init(sha512_mb_mod_init);
+module_exit(sha512_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS("sha512");
-- 
cgit v1.2.3


From 45691e2d9b18a0a1675b2c7504847f7c228f7657 Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@linux.intel.com>
Date: Mon, 27 Jun 2016 10:20:06 -0700
Subject: crypto: sha512-mb - submit/flush routines for AVX2

This patch introduces the routines used to submit and flush buffers
belonging to SHA512 crypto jobs to the SHA512 multibuffer algorithm.
It is implemented mostly in assembly optimized with AVX2 instructions.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 .../crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S    | 291 +++++++++++++++++++++
 .../x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c |  67 +++++
 .../crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S   | 222 ++++++++++++++++
 3 files changed, 580 insertions(+)
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..3ddba19a0db6
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
@@ -0,0 +1,291 @@
+/*
+ * Flush routine for SHA512 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+# LINUX register definitions
+#define arg1    %rdi
+#define arg2    %rsi
+
+# idx needs to be other than arg1, arg2, rbx, r12
+#define idx     %rdx
+
+# Common definitions
+#define state   arg1
+#define job     arg2
+#define len2    arg2
+
+#define unused_lanes    %rbx
+#define lane_data       %rbx
+#define tmp2            %rbx
+
+#define job_rax         %rax
+#define tmp1            %rax
+#define size_offset     %rax
+#define tmp             %rax
+#define start_offset    %rax
+
+#define tmp3            arg1
+
+#define extra_blocks    arg2
+#define p               arg2
+
+#define tmp4            %r8
+#define lens0           %r8
+
+#define lens1           %r9
+#define lens2           %r10
+#define lens3           %r11
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne     skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha512_mb_mgr_flush_avx2)
+	FRAME_BEGIN
+	push	%rbx
+
+	# If bit (32+3) is set, then all lanes are empty
+	mov     _unused_lanes(state), unused_lanes
+        bt      $32+7, unused_lanes
+        jc      return_null
+
+        # find a lane with a non-null job
+	xor     idx, idx
+        offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  one(%rip), idx
+        offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  two(%rip), idx
+        offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  three(%rip), idx
+
+        # copy idx to empty lanes
+copy_lane_data:
+	offset =  (_args + _data_ptr)
+        mov     offset(state,idx,8), tmp
+
+        I = 0
+.rep 4
+	offset =  (_ldata + I * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+.altmacro
+        JNE_SKIP %I
+        offset =  (_args + _data_ptr + 8*I)
+        mov     tmp, offset(state)
+        offset =  (_lens + 8*I +4)
+        movl    $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+        I = (I+1)
+.noaltmacro
+.endr
+
+        # Find min length
+        mov     _lens + 0*8(state),lens0
+        mov     lens0,idx
+        mov     _lens + 1*8(state),lens1
+        cmp     idx,lens1
+        cmovb   lens1,idx
+        mov     _lens + 2*8(state),lens2
+        cmp     idx,lens2
+        cmovb   lens2,idx
+        mov     _lens + 3*8(state),lens3
+        cmp     idx,lens3
+        cmovb   lens3,idx
+        mov     idx,len2
+        and     $0xF,idx
+        and     $~0xFF,len2
+	jz      len_is_0
+
+        sub     len2, lens0
+        sub     len2, lens1
+        sub     len2, lens2
+        sub     len2, lens3
+        shr     $32,len2
+        mov     lens0, _lens + 0*8(state)
+        mov     lens1, _lens + 1*8(state)
+        mov     lens2, _lens + 2*8(state)
+        mov     lens3, _lens + 3*8(state)
+
+        # "state" and "args" are the same address, arg1
+        # len is arg2
+        call    sha512_x4_avx2
+        # state and idx are intact
+
+len_is_0:
+        # process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+        lea     _ldata(state, lane_data), lane_data
+
+        mov     _job_in_lane(lane_data), job_rax
+        movq    $0,  _job_in_lane(lane_data)
+        movl    $STS_COMPLETED, _status(job_rax)
+        mov     _unused_lanes(state), unused_lanes
+        shl     $8, unused_lanes
+        or      idx, unused_lanes
+        mov     unused_lanes, _unused_lanes(state)
+
+	movl    $0xFFFFFFFF, _lens+4(state,  idx, 8)
+
+	vmovq _args_digest+0*32(state, idx, 8), %xmm0
+        vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+	vmovq _args_digest+2*32(state, idx, 8), %xmm1
+        vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+	vmovq _args_digest+4*32(state, idx, 8), %xmm2
+        vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+	vmovq _args_digest+6*32(state, idx, 8), %xmm3
+	vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+	vmovdqu %xmm0, _result_digest(job_rax)
+	vmovdqu %xmm1, _result_digest+1*16(job_rax)
+	vmovdqu %xmm2, _result_digest+2*16(job_rax)
+	vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+return:
+	pop	%rbx
+	FRAME_END
+        ret
+
+return_null:
+        xor     job_rax, job_rax
+        jmp     return
+ENDPROC(sha512_mb_mgr_flush_avx2)
+.align 16
+
+ENTRY(sha512_mb_mgr_get_comp_job_avx2)
+        push    %rbx
+
+	mov     _unused_lanes(state), unused_lanes
+        bt      $(32+7), unused_lanes
+        jc      .return_null
+
+        # Find min length
+        mov     _lens(state),lens0
+        mov     lens0,idx
+        mov     _lens+1*8(state),lens1
+        cmp     idx,lens1
+        cmovb   lens1,idx
+        mov     _lens+2*8(state),lens2
+        cmp     idx,lens2
+        cmovb   lens2,idx
+        mov     _lens+3*8(state),lens3
+        cmp     idx,lens3
+        cmovb   lens3,idx
+        test    $~0xF,idx
+        jnz     .return_null
+        and     $0xF,idx
+
+        #process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+        lea     _ldata(state, lane_data), lane_data
+
+        mov     _job_in_lane(lane_data), job_rax
+        movq    $0,  _job_in_lane(lane_data)
+        movl    $STS_COMPLETED, _status(job_rax)
+        mov     _unused_lanes(state), unused_lanes
+        shl     $8, unused_lanes
+        or      idx, unused_lanes
+        mov     unused_lanes, _unused_lanes(state)
+
+        movl    $0xFFFFFFFF, _lens+4(state,  idx, 8)
+
+	vmovq   _args_digest(state, idx, 8), %xmm0
+        vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+	vmovq    _args_digest+2*32(state, idx, 8), %xmm1
+        vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+	vmovq    _args_digest+4*32(state, idx, 8), %xmm2
+        vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+        vmovq    _args_digest+6*32(state, idx, 8), %xmm3
+        vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+	vmovdqu %xmm0, _result_digest+0*16(job_rax)
+	vmovdqu %xmm1, _result_digest+1*16(job_rax)
+	vmovdqu %xmm2, _result_digest+2*16(job_rax)
+	vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+	pop     %rbx
+
+        ret
+
+.return_null:
+        xor     job_rax, job_rax
+	pop     %rbx
+        ret
+ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
+.data
+
+.align 16
+one:
+.quad  1
+two:
+.quad  2
+three:
+.quad  3
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..36870b26067a
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,67 @@
+/*
+ * Initialization code for multi buffer SHA256 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha512_mb_mgr.h"
+
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
+{
+	unsigned int j;
+
+	state->lens[0] = 0;
+	state->lens[1] = 1;
+	state->lens[2] = 2;
+	state->lens[3] = 3;
+	state->unused_lanes = 0xFF03020100;
+	for (j = 0; j < 4; j++)
+		state->ldata[j].job_in_lane = NULL;
+}
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..815f07bdd1f8
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
@@ -0,0 +1,222 @@
+/*
+ * Buffer submit code for multi buffer SHA512 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+#define arg1    %rdi
+#define arg2    %rsi
+
+#define idx             %rdx
+#define last_len        %rdx
+
+#define size_offset     %rcx
+#define tmp2            %rcx
+
+# Common definitions
+#define state   arg1
+#define job     arg2
+#define len2    arg2
+#define p2      arg2
+
+#define p               %r11
+#define start_offset    %r11
+
+#define unused_lanes    %rbx
+
+#define job_rax         %rax
+#define len             %rax
+
+#define lane            %r12
+#define tmp3            %r12
+#define lens3           %r12
+
+#define extra_blocks    %r8
+#define lens0           %r8
+
+#define tmp             %r9
+#define lens1           %r9
+
+#define lane_data       %r10
+#define lens2           %r10
+
+#define DWORD_len %eax
+
+# JOB* sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha512_mb_mgr_submit_avx2)
+	FRAME_BEGIN
+	push	%rbx
+	push	%r12
+
+        mov     _unused_lanes(state), unused_lanes
+        movzb     %bl,lane
+        shr     $8, unused_lanes
+        imul    $_LANE_DATA_size, lane,lane_data
+        movl    $STS_BEING_PROCESSED, _status(job)
+	lea     _ldata(state, lane_data), lane_data
+        mov     unused_lanes, _unused_lanes(state)
+        movl    _len(job),  DWORD_len
+
+	mov     job, _job_in_lane(lane_data)
+        movl    DWORD_len,_lens+4(state , lane, 8)
+
+	# Load digest words from result_digest
+	vmovdqu	_result_digest+0*16(job), %xmm0
+	vmovdqu _result_digest+1*16(job), %xmm1
+	vmovdqu	_result_digest+2*16(job), %xmm2
+        vmovdqu	_result_digest+3*16(job), %xmm3
+
+	vmovq    %xmm0, _args_digest(state, lane, 8)
+	vpextrq  $1, %xmm0, _args_digest+1*32(state , lane, 8)
+	vmovq    %xmm1, _args_digest+2*32(state , lane, 8)
+	vpextrq  $1, %xmm1, _args_digest+3*32(state , lane, 8)
+	vmovq    %xmm2, _args_digest+4*32(state , lane, 8)
+	vpextrq  $1, %xmm2, _args_digest+5*32(state , lane, 8)
+	vmovq    %xmm3, _args_digest+6*32(state , lane, 8)
+	vpextrq  $1, %xmm3, _args_digest+7*32(state , lane, 8)
+
+	mov     _buffer(job), p
+	mov     p, _args_data_ptr(state, lane, 8)
+
+	cmp     $0xFF, unused_lanes
+	jne     return_null
+
+start_loop:
+
+	# Find min length
+	mov     _lens+0*8(state),lens0
+	mov     lens0,idx
+	mov     _lens+1*8(state),lens1
+	cmp     idx,lens1
+	cmovb   lens1, idx
+	mov     _lens+2*8(state),lens2
+	cmp     idx,lens2
+	cmovb   lens2,idx
+	mov     _lens+3*8(state),lens3
+	cmp     idx,lens3
+	cmovb   lens3,idx
+	mov     idx,len2
+	and     $0xF,idx
+	and     $~0xFF,len2
+	jz      len_is_0
+
+	sub     len2,lens0
+	sub     len2,lens1
+	sub     len2,lens2
+	sub     len2,lens3
+	shr     $32,len2
+	mov     lens0, _lens + 0*8(state)
+	mov     lens1, _lens + 1*8(state)
+	mov     lens2, _lens + 2*8(state)
+	mov     lens3, _lens + 3*8(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call    sha512_x4_avx2
+	# state and idx are intact
+
+len_is_0:
+
+	# process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+	lea     _ldata(state, lane_data), lane_data
+
+	mov     _job_in_lane(lane_data), job_rax
+	mov     _unused_lanes(state), unused_lanes
+	movq    $0, _job_in_lane(lane_data)
+	movl    $STS_COMPLETED, _status(job_rax)
+	shl     $8, unused_lanes
+	or      idx, unused_lanes
+	mov     unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF,_lens+4(state,idx,8)
+	vmovq    _args_digest+0*32(state , idx, 8), %xmm0
+	vpinsrq  $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0
+	vmovq    _args_digest+2*32(state , idx, 8), %xmm1
+	vpinsrq  $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1
+	vmovq    _args_digest+4*32(state , idx, 8), %xmm2
+	vpinsrq  $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2
+	vmovq    _args_digest+6*32(state , idx, 8), %xmm3
+	vpinsrq  $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3
+
+	vmovdqu  %xmm0, _result_digest + 0*16(job_rax)
+	vmovdqu  %xmm1, _result_digest + 1*16(job_rax)
+	vmovdqu  %xmm2, _result_digest + 2*16(job_rax)
+	vmovdqu  %xmm3, _result_digest + 3*16(job_rax)
+
+return:
+	pop	%r12
+	pop	%rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor     job_rax, job_rax
+	jmp     return
+ENDPROC(sha512_mb_mgr_submit_avx2)
+.data
+
+.align 16
+H0:     .int  0x6a09e667
+H1:     .int  0xbb67ae85
+H2:     .int  0x3c6ef372
+H3:     .int  0xa54ff53a
+H4:     .int  0x510e527f
+H5:     .int  0x9b05688c
+H6:     .int  0x1f83d9ab
+H7:     .int  0x5be0cd19
-- 
cgit v1.2.3


From 2cdacb68d70d3c2f0fc41108619d51bc256df10a Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@linux.intel.com>
Date: Mon, 27 Jun 2016 10:20:07 -0700
Subject: crypto: sha512-mb - Algorithm data structures

This patch introduces the data structures and prototypes of functions
needed for computing SHA512 hash using multi-buffer. Included are the
structures of the multi-buffer SHA512 job, job scheduler in C and x86
assembly.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha512-mb/sha512_mb_ctx.h          | 130 ++++++++++
 arch/x86/crypto/sha512-mb/sha512_mb_mgr.h          | 104 ++++++++
 .../crypto/sha512-mb/sha512_mb_mgr_datastruct.S    | 281 +++++++++++++++++++++
 3 files changed, 515 insertions(+)
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
new file mode 100644
index 000000000000..9d4b2c8208d5
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
@@ -0,0 +1,130 @@
+/*
+ * Header file for multi buffer SHA512 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha512_mb_mgr.h"
+
+#define HASH_UPDATE          0x00
+#define HASH_FIRST           0x01
+#define HASH_LAST            0x02
+#define HASH_ENTIRE          0x03
+#define HASH_DONE            0x04
+#define HASH_FINAL           0x08
+
+#define HASH_CTX_STS_IDLE       0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST       0x02
+#define HASH_CTX_STS_COMPLETE   0x04
+
+enum hash_ctx_error {
+	HASH_CTX_ERROR_NONE               =  0,
+	HASH_CTX_ERROR_INVALID_FLAGS      = -1,
+	HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+	HASH_CTX_ERROR_ALREADY_COMPLETED  = -3,
+};
+
+#define hash_ctx_user_data(ctx)  ((ctx)->user_data)
+#define hash_ctx_digest(ctx)     ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx)   ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx)     ((ctx)->status)
+#define hash_ctx_error(ctx)      ((ctx)->error)
+#define hash_ctx_init(ctx) \
+	do { \
+		(ctx)->error = HASH_CTX_ERROR_NONE; \
+		(ctx)->status = HASH_CTX_STS_COMPLETE; \
+	} while (0)
+
+/* Hash Constants and Typedefs */
+#define SHA512_DIGEST_LENGTH          8
+#define SHA512_LOG2_BLOCK_SIZE        7
+
+#define SHA512_PADLENGTHFIELD_SIZE    16
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+	if (unlikely(!(expr))) { \
+		printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+		#expr, __FILE__, __func__, __LINE__); \
+	} \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha512_ctx_mgr {
+	struct sha512_mb_mgr mgr;
+};
+
+/* typedef struct sha512_ctx_mgr sha512_ctx_mgr; */
+
+struct sha512_hash_ctx {
+	/* Must be at struct offset 0 */
+	struct job_sha512       job;
+	/* status flag */
+	int status;
+	/* error flag */
+	int error;
+
+	uint32_t        total_length;
+	const void      *incoming_buffer;
+	uint32_t        incoming_buffer_length;
+	uint8_t         partial_block_buffer[SHA512_BLOCK_SIZE * 2];
+	uint32_t        partial_block_buffer_length;
+	void            *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
new file mode 100644
index 000000000000..178f17eef382
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
@@ -0,0 +1,104 @@
+/*
+ * Header file for multi buffer SHA512 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA512_DIGEST_WORDS 8
+
+enum job_sts {STS_UNKNOWN = 0,
+	STS_BEING_PROCESSED = 1,
+	STS_COMPLETED =       2,
+	STS_INTERNAL_ERROR = 3,
+	STS_ERROR = 4
+};
+
+struct job_sha512 {
+	u8  *buffer;
+	u64  len;
+	u64  result_digest[NUM_SHA512_DIGEST_WORDS] __aligned(32);
+	enum job_sts status;
+	void   *user_data;
+};
+
+struct sha512_args_x4 {
+	uint64_t        digest[8][4];
+	uint8_t         *data_ptr[4];
+};
+
+struct sha512_lane_data {
+	struct job_sha512 *job_in_lane;
+};
+
+struct sha512_mb_mgr {
+	struct sha512_args_x4 args;
+
+	uint64_t lens[4];
+
+	/* each byte is index (0...7) of unused lanes */
+	uint64_t unused_lanes;
+	/* byte 4 is set to FF as a flag */
+	struct sha512_lane_data ldata[4];
+};
+
+#define SHA512_MB_MGR_NUM_LANES_AVX2 4
+
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_submit_avx2(struct sha512_mb_mgr *state,
+						struct job_sha512 *job);
+struct job_sha512 *sha512_mb_mgr_flush_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_get_comp_job_avx2(struct sha512_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..cf2636d4c9ba
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
@@ -0,0 +1,281 @@
+/*
+ * Header file for multi buffer SHA256 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS   # JOB_AES
+###     name            size    align
+#FIELD  _plaintext,     8,      8       # pointer to plaintext
+#FIELD  _ciphertext,    8,      8       # pointer to ciphertext
+#FIELD  _IV,            16,     8       # IV
+#FIELD  _keys,          8,      8       # pointer to keys
+#FIELD  _len,           4,      4       # length in bytes
+#FIELD  _status,        4,      4       # status enumeration
+#FIELD  _user_data,     8,      8       # pointer to user data
+#UNION  _union,         size1,  align1, \
+#                       size2,  align2, \
+#                       size3,  align3, \
+#                       ...
+#END_FIELDS
+#%assign _JOB_AES_size  _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+#       STRUCT job_aes2
+#       RES_Q   .plaintext,     1
+#       RES_Q   .ciphertext,    1
+#       RES_DQ  .IV,            1
+#       RES_B   .nested,        _JOB_AES_SIZE, _JOB_AES_ALIGN
+#       RES_U   .union,         size1, align1, \
+#                               size2, align2, \
+#                               ...
+#       ENDSTRUCT
+#       # Following only needed if nesting
+#       %assign job_aes2_size   _FIELD_OFFSET
+#       %assign job_aes2_align  _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro    Base size
+# RES_B     1
+# RES_W     2
+# RES_D     4
+# RES_Q     8
+# RES_DQ   16
+# RES_Y    32
+# RES_Z    64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+
+#define PTR_SZ                  8
+#define SHA512_DIGEST_WORD_SIZE 8
+#define SHA512_MB_MGR_NUM_LANES_AVX2 4
+#define NUM_SHA512_DIGEST_WORDS 8
+#define SZ4                     4*SHA512_DIGEST_WORD_SIZE
+#define ROUNDS                  80*SZ4
+#define SHA512_DIGEST_ROW_SIZE  (SHA512_MB_MGR_NUM_LANES_AVX2 * 8)
+
+# START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+# FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name  = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - ##tmp)
+.if (tmp > 0)
+        .lcomm  tmp
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+###################################################################
+### Define SHA512 Out Of Order Data Structures
+###################################################################
+
+START_FIELDS    # LANE_DATA
+###     name            size    align
+FIELD   _job_in_lane,   8,      8       # pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET
+ _LANE_DATA_align = _STRUCT_ALIGN
+
+####################################################################
+
+START_FIELDS    # SHA512_ARGS_X4
+###     name            size    align
+FIELD   _digest,        8*8*4,  4      # transposed digest
+FIELD   _data_ptr,      8*4,    8       # array of pointers to data
+END_FIELDS
+
+ _SHA512_ARGS_X4_size  =  _FIELD_OFFSET
+ _SHA512_ARGS_X4_align =  _STRUCT_ALIGN
+
+#####################################################################
+
+START_FIELDS    # MB_MGR
+###     name            size    align
+FIELD   _args,          _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
+FIELD   _lens,          8*4,    8
+FIELD   _unused_lanes,  8,      8
+FIELD   _ldata,         _LANE_DATA_size*4, _LANE_DATA_align
+END_FIELDS
+
+ _MB_MGR_size  =  _FIELD_OFFSET
+ _MB_MGR_align =  _STRUCT_ALIGN
+
+_args_digest = _args + _digest
+_args_data_ptr = _args + _data_ptr
+
+#######################################################################
+
+#######################################################################
+#### Define constants
+#######################################################################
+
+#define STS_UNKNOWN             0
+#define STS_BEING_PROCESSED     1
+#define STS_COMPLETED           2
+
+#######################################################################
+#### Define JOB_SHA512 structure
+#######################################################################
+
+START_FIELDS    # JOB_SHA512
+###     name                            size    align
+FIELD   _buffer,                        8,      8       # pointer to buffer
+FIELD   _len,                           8,      8       # length in bytes
+FIELD   _result_digest,                 8*8,    32      # Digest (output)
+FIELD   _status,                        4,      4
+FIELD   _user_data,                     8,      8
+END_FIELDS
+
+ _JOB_SHA512_size = _FIELD_OFFSET
+ _JOB_SHA512_align = _STRUCT_ALIGN
-- 
cgit v1.2.3


From bee5cfd9f6702fa93f183a289dda798042bde489 Mon Sep 17 00:00:00 2001
From: Megha Dey <megha.dey@linux.intel.com>
Date: Mon, 27 Jun 2016 10:20:08 -0700
Subject: crypto: sha512-mb - Crypto computation (x4 AVX2)

This patch introduces the assembly routines to do SHA512 computation on
buffers belonging to several jobs at once. The assembly routines are
optimized with AVX2 instructions that have 4 data lanes and using AVX2
registers.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha512-mb/sha512_x4_avx2.S | 529 +++++++++++++++++++++++++++++
 1 file changed, 529 insertions(+)
 create mode 100644 arch/x86/crypto/sha512-mb/sha512_x4_avx2.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
new file mode 100644
index 000000000000..31ab1eff6413
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
@@ -0,0 +1,529 @@
+/*
+ * Multi-buffer SHA512 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# code to compute quad SHA512 using AVX2
+# use YMMs to tackle the larger digest size
+# outer calling routine takes care of save and restore of XMM registers
+# Logic designed/laid out by JDG
+
+# Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+# Stack must be aligned to 32 bytes before call
+# Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12
+# Linux preserves: rcx rdx rdi rbp r13 r14 r15
+# clobbers ymm0-15
+
+#include <linux/linkage.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+arg1 = %rdi
+arg2 = %rsi
+
+# Common definitions
+STATE = arg1
+INP_SIZE = arg2
+
+IDX = %rax
+ROUND = %rbx
+TBL = %r8
+
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+
+a = %ymm0
+b = %ymm1
+c = %ymm2
+d = %ymm3
+e = %ymm4
+f = %ymm5
+g = %ymm6
+h = %ymm7
+
+a0 = %ymm8
+a1 = %ymm9
+a2 = %ymm10
+
+TT0 = %ymm14
+TT1 = %ymm13
+TT2 = %ymm12
+TT3 = %ymm11
+TT4 = %ymm10
+TT5 = %ymm9
+
+T1 = %ymm14
+TMP = %ymm15
+
+# Define stack usage
+STACK_SPACE1 = SZ4*16 + NUM_SHA512_DIGEST_WORDS*SZ4 + 24
+
+#define VMOVPD	vmovupd
+_digest = SZ4*16
+
+# transpose r0, r1, r2, r3, t0, t1
+# "transpose" data in {r0..r3} using temps {t0..t3}
+# Input looks like: {r0 r1 r2 r3}
+# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+#
+# output looks like: {t0 r1 r0 r3}
+# t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+# r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+# r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+# r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+
+.macro TRANSPOSE r0 r1 r2 r3 t0 t1
+	vshufps  $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
+        vshufps  $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
+        vshufps  $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
+        vshufps  $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
+
+	vperm2f128      $0x20, \r2, \r0, \r1  # h6...a6
+        vperm2f128      $0x31, \r2, \r0, \r3  # h2...a2
+        vperm2f128      $0x31, \t1, \t0, \r0  # h5...a5
+        vperm2f128      $0x20, \t1, \t0, \t0  # h1...a1
+.endm
+
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+# PRORQ reg, imm, tmp
+# packed-rotate-right-double
+# does a rotate by doing two shifts and an or
+.macro _PRORQ reg imm tmp
+	vpsllq	$(64-\imm),\reg,\tmp
+	vpsrlq	$\imm,\reg, \reg
+	vpor	\tmp,\reg, \reg
+.endm
+
+# non-destructive
+# PRORQ_nd reg, imm, tmp, src
+.macro _PRORQ_nd reg imm tmp src
+	vpsllq	$(64-\imm), \src, \tmp
+	vpsrlq	$\imm, \src, \reg
+	vpor	\tmp, \reg, \reg
+.endm
+
+# PRORQ dst/src, amt
+.macro PRORQ reg imm
+	_PRORQ	\reg, \imm, TMP
+.endm
+
+# PRORQ_nd dst, src, amt
+.macro PRORQ_nd reg tmp imm
+	_PRORQ_nd	\reg, \imm, TMP, \tmp
+.endm
+
+#; arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_00_15 _T1 i
+	PRORQ_nd a0, e, (18-14)	# sig1: a0 = (e >> 4)
+
+	vpxor   g, f, a2        # ch: a2 = f^g
+        vpand   e,a2, a2                # ch: a2 = (f^g)&e
+        vpxor   g, a2, a2               # a2 = ch
+
+        PRORQ_nd        a1,e,41         # sig1: a1 = (e >> 25)
+
+        offset = SZ4*(\i & 0xf)
+        vmovdqu \_T1,offset(%rsp)
+        vpaddq  (TBL,ROUND,1), \_T1, \_T1       # T1 = W + K
+        vpxor   e,a0, a0        # sig1: a0 = e ^ (e >> 5)
+        PRORQ   a0, 14           # sig1: a0 = (e >> 6) ^ (e >> 11)
+        vpaddq  a2, h, h        # h = h + ch
+        PRORQ_nd        a2,a,6  # sig0: a2 = (a >> 11)
+        vpaddq  \_T1,h, h       # h = h + ch + W + K
+        vpxor   a1, a0, a0      # a0 = sigma1
+	vmovdqu a,\_T1
+        PRORQ_nd        a1,a,39 # sig0: a1 = (a >> 22)
+        vpxor   c, \_T1, \_T1      # maj: T1 = a^c
+        add     $SZ4, ROUND     # ROUND++
+        vpand   b, \_T1, \_T1   # maj: T1 = (a^c)&b
+        vpaddq  a0, h, h
+        vpaddq  h, d, d
+        vpxor   a, a2, a2       # sig0: a2 = a ^ (a >> 11)
+        PRORQ   a2,28            # sig0: a2 = (a >> 2) ^ (a >> 13)
+        vpxor   a1, a2, a2      # a2 = sig0
+        vpand   c, a, a1        # maj: a1 = a&c
+        vpor    \_T1, a1, a1    # a1 = maj
+        vpaddq  a1, h, h        # h = h + ch + W + K + maj
+        vpaddq  a2, h, h        # h = h + ch + W + K + maj + sigma0
+        ROTATE_ARGS
+.endm
+
+
+#; arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_16_XX _T1 i
+	vmovdqu SZ4*((\i-15)&0xf)(%rsp), \_T1
+        vmovdqu SZ4*((\i-2)&0xf)(%rsp), a1
+        vmovdqu \_T1, a0
+        PRORQ   \_T1,7
+        vmovdqu a1, a2
+        PRORQ   a1,42
+        vpxor   a0, \_T1, \_T1
+        PRORQ   \_T1, 1
+        vpxor   a2, a1, a1
+        PRORQ   a1, 19
+        vpsrlq  $7, a0, a0
+        vpxor   a0, \_T1, \_T1
+        vpsrlq  $6, a2, a2
+        vpxor   a2, a1, a1
+        vpaddq  SZ4*((\i-16)&0xf)(%rsp), \_T1, \_T1
+        vpaddq  SZ4*((\i-7)&0xf)(%rsp), a1, a1
+        vpaddq  a1, \_T1, \_T1
+
+        ROUND_00_15 \_T1,\i
+.endm
+
+
+# void sha512_x4_avx2(void *STATE, const int INP_SIZE)
+# arg 1 : STATE    : pointer to input data
+# arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+ENTRY(sha512_x4_avx2)
+	# general registers preserved in outer calling routine
+	# outer calling routine saves all the XMM registers
+	# save callee-saved clobbered registers to comply with C function ABI
+	push    %r12
+	push    %r13
+	push    %r14
+	push    %r15
+
+	sub     $STACK_SPACE1, %rsp
+
+        # Load the pre-transposed incoming digest.
+        vmovdqu 0*SHA512_DIGEST_ROW_SIZE(STATE),a
+        vmovdqu 1*SHA512_DIGEST_ROW_SIZE(STATE),b
+        vmovdqu 2*SHA512_DIGEST_ROW_SIZE(STATE),c
+        vmovdqu 3*SHA512_DIGEST_ROW_SIZE(STATE),d
+        vmovdqu 4*SHA512_DIGEST_ROW_SIZE(STATE),e
+        vmovdqu 5*SHA512_DIGEST_ROW_SIZE(STATE),f
+        vmovdqu 6*SHA512_DIGEST_ROW_SIZE(STATE),g
+        vmovdqu 7*SHA512_DIGEST_ROW_SIZE(STATE),h
+
+        lea     K512_4(%rip),TBL
+
+        # load the address of each of the 4 message lanes
+        # getting ready to transpose input onto stack
+        mov     _data_ptr+0*PTR_SZ(STATE),inp0
+        mov     _data_ptr+1*PTR_SZ(STATE),inp1
+        mov     _data_ptr+2*PTR_SZ(STATE),inp2
+        mov     _data_ptr+3*PTR_SZ(STATE),inp3
+
+        xor     IDX, IDX
+lloop:
+        xor     ROUND, ROUND
+
+	# save old digest
+        vmovdqu a, _digest(%rsp)
+        vmovdqu b, _digest+1*SZ4(%rsp)
+        vmovdqu c, _digest+2*SZ4(%rsp)
+        vmovdqu d, _digest+3*SZ4(%rsp)
+        vmovdqu e, _digest+4*SZ4(%rsp)
+        vmovdqu f, _digest+5*SZ4(%rsp)
+        vmovdqu g, _digest+6*SZ4(%rsp)
+        vmovdqu h, _digest+7*SZ4(%rsp)
+        i = 0
+.rep 4
+	vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP
+        VMOVPD  i*32(inp0, IDX), TT2
+        VMOVPD  i*32(inp1, IDX), TT1
+        VMOVPD  i*32(inp2, IDX), TT4
+        VMOVPD  i*32(inp3, IDX), TT3
+	TRANSPOSE	TT2, TT1, TT4, TT3, TT0, TT5
+	vpshufb	TMP, TT0, TT0
+	vpshufb	TMP, TT1, TT1
+	vpshufb	TMP, TT2, TT2
+	vpshufb	TMP, TT3, TT3
+	ROUND_00_15	TT0,(i*4+0)
+	ROUND_00_15	TT1,(i*4+1)
+	ROUND_00_15	TT2,(i*4+2)
+	ROUND_00_15	TT3,(i*4+3)
+	i = (i+1)
+.endr
+        add     $128, IDX
+
+        i = (i*4)
+
+        jmp     Lrounds_16_xx
+.align 16
+Lrounds_16_xx:
+.rep 16
+        ROUND_16_XX     T1, i
+        i = (i+1)
+.endr
+        cmp     $0xa00,ROUND
+        jb      Lrounds_16_xx
+
+	# add old digest
+        vpaddq  _digest(%rsp), a, a
+        vpaddq  _digest+1*SZ4(%rsp), b, b
+        vpaddq  _digest+2*SZ4(%rsp), c, c
+        vpaddq  _digest+3*SZ4(%rsp), d, d
+        vpaddq  _digest+4*SZ4(%rsp), e, e
+        vpaddq  _digest+5*SZ4(%rsp), f, f
+        vpaddq  _digest+6*SZ4(%rsp), g, g
+        vpaddq  _digest+7*SZ4(%rsp), h, h
+
+        sub     $1, INP_SIZE  # unit is blocks
+        jne     lloop
+
+        # write back to memory (state object) the transposed digest
+        vmovdqu a, 0*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu b, 1*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu c, 2*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu d, 3*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu e, 4*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu f, 5*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu g, 6*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu h, 7*SHA512_DIGEST_ROW_SIZE(STATE)
+
+	# update input data pointers
+	add     IDX, inp0
+        mov     inp0, _data_ptr+0*PTR_SZ(STATE)
+        add     IDX, inp1
+        mov     inp1, _data_ptr+1*PTR_SZ(STATE)
+        add     IDX, inp2
+        mov     inp2, _data_ptr+2*PTR_SZ(STATE)
+        add     IDX, inp3
+        mov     inp3, _data_ptr+3*PTR_SZ(STATE)
+
+	#;;;;;;;;;;;;;;;
+	#; Postamble
+	add $STACK_SPACE1, %rsp
+	# restore callee-saved clobbered registers
+
+	pop     %r15
+	pop     %r14
+	pop     %r13
+	pop     %r12
+
+	# outer calling routine restores XMM and other GP registers
+	ret
+ENDPROC(sha512_x4_avx2)
+
+.data
+.align 64
+K512_4:
+	.octa 0x428a2f98d728ae22428a2f98d728ae22,\
+		0x428a2f98d728ae22428a2f98d728ae22
+	.octa 0x7137449123ef65cd7137449123ef65cd,\
+		0x7137449123ef65cd7137449123ef65cd
+	.octa 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f,\
+		0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f
+	.octa 0xe9b5dba58189dbbce9b5dba58189dbbc,\
+		0xe9b5dba58189dbbce9b5dba58189dbbc
+	.octa 0x3956c25bf348b5383956c25bf348b538,\
+		0x3956c25bf348b5383956c25bf348b538
+	.octa 0x59f111f1b605d01959f111f1b605d019,\
+		0x59f111f1b605d01959f111f1b605d019
+	.octa 0x923f82a4af194f9b923f82a4af194f9b,\
+		0x923f82a4af194f9b923f82a4af194f9b
+	.octa 0xab1c5ed5da6d8118ab1c5ed5da6d8118,\
+		0xab1c5ed5da6d8118ab1c5ed5da6d8118
+	.octa 0xd807aa98a3030242d807aa98a3030242,\
+		0xd807aa98a3030242d807aa98a3030242
+	.octa 0x12835b0145706fbe12835b0145706fbe,\
+		0x12835b0145706fbe12835b0145706fbe
+	.octa 0x243185be4ee4b28c243185be4ee4b28c,\
+		0x243185be4ee4b28c243185be4ee4b28c
+	.octa 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2,\
+		0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2
+	.octa 0x72be5d74f27b896f72be5d74f27b896f,\
+		0x72be5d74f27b896f72be5d74f27b896f
+	.octa 0x80deb1fe3b1696b180deb1fe3b1696b1,\
+		0x80deb1fe3b1696b180deb1fe3b1696b1
+	.octa 0x9bdc06a725c712359bdc06a725c71235,\
+		0x9bdc06a725c712359bdc06a725c71235
+	.octa 0xc19bf174cf692694c19bf174cf692694,\
+		0xc19bf174cf692694c19bf174cf692694
+	.octa 0xe49b69c19ef14ad2e49b69c19ef14ad2,\
+		0xe49b69c19ef14ad2e49b69c19ef14ad2
+	.octa 0xefbe4786384f25e3efbe4786384f25e3,\
+		0xefbe4786384f25e3efbe4786384f25e3
+	.octa 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5,\
+		0x0fc19dc68b8cd5b50fc19dc68b8cd5b5
+	.octa 0x240ca1cc77ac9c65240ca1cc77ac9c65,\
+		0x240ca1cc77ac9c65240ca1cc77ac9c65
+	.octa 0x2de92c6f592b02752de92c6f592b0275,\
+		0x2de92c6f592b02752de92c6f592b0275
+	.octa 0x4a7484aa6ea6e4834a7484aa6ea6e483,\
+		0x4a7484aa6ea6e4834a7484aa6ea6e483
+	.octa 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4,\
+		0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4
+	.octa 0x76f988da831153b576f988da831153b5,\
+		0x76f988da831153b576f988da831153b5
+	.octa 0x983e5152ee66dfab983e5152ee66dfab,\
+		0x983e5152ee66dfab983e5152ee66dfab
+	.octa 0xa831c66d2db43210a831c66d2db43210,\
+		0xa831c66d2db43210a831c66d2db43210
+	.octa 0xb00327c898fb213fb00327c898fb213f,\
+		0xb00327c898fb213fb00327c898fb213f
+	.octa 0xbf597fc7beef0ee4bf597fc7beef0ee4,\
+		0xbf597fc7beef0ee4bf597fc7beef0ee4
+	.octa 0xc6e00bf33da88fc2c6e00bf33da88fc2,\
+		0xc6e00bf33da88fc2c6e00bf33da88fc2
+	.octa 0xd5a79147930aa725d5a79147930aa725,\
+		0xd5a79147930aa725d5a79147930aa725
+	.octa 0x06ca6351e003826f06ca6351e003826f,\
+		0x06ca6351e003826f06ca6351e003826f
+	.octa 0x142929670a0e6e70142929670a0e6e70,\
+		0x142929670a0e6e70142929670a0e6e70
+	.octa 0x27b70a8546d22ffc27b70a8546d22ffc,\
+		0x27b70a8546d22ffc27b70a8546d22ffc
+	.octa 0x2e1b21385c26c9262e1b21385c26c926,\
+		0x2e1b21385c26c9262e1b21385c26c926
+	.octa 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed,\
+		0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed
+	.octa 0x53380d139d95b3df53380d139d95b3df,\
+		0x53380d139d95b3df53380d139d95b3df
+	.octa 0x650a73548baf63de650a73548baf63de,\
+		0x650a73548baf63de650a73548baf63de
+	.octa 0x766a0abb3c77b2a8766a0abb3c77b2a8,\
+		0x766a0abb3c77b2a8766a0abb3c77b2a8
+	.octa 0x81c2c92e47edaee681c2c92e47edaee6,\
+		0x81c2c92e47edaee681c2c92e47edaee6
+	.octa 0x92722c851482353b92722c851482353b,\
+		0x92722c851482353b92722c851482353b
+	.octa 0xa2bfe8a14cf10364a2bfe8a14cf10364,\
+		0xa2bfe8a14cf10364a2bfe8a14cf10364
+	.octa 0xa81a664bbc423001a81a664bbc423001,\
+		0xa81a664bbc423001a81a664bbc423001
+	.octa 0xc24b8b70d0f89791c24b8b70d0f89791,\
+		0xc24b8b70d0f89791c24b8b70d0f89791
+	.octa 0xc76c51a30654be30c76c51a30654be30,\
+		0xc76c51a30654be30c76c51a30654be30
+	.octa 0xd192e819d6ef5218d192e819d6ef5218,\
+		0xd192e819d6ef5218d192e819d6ef5218
+	.octa 0xd69906245565a910d69906245565a910,\
+		0xd69906245565a910d69906245565a910
+	.octa 0xf40e35855771202af40e35855771202a,\
+		0xf40e35855771202af40e35855771202a
+	.octa 0x106aa07032bbd1b8106aa07032bbd1b8,\
+		0x106aa07032bbd1b8106aa07032bbd1b8
+	.octa 0x19a4c116b8d2d0c819a4c116b8d2d0c8,\
+		0x19a4c116b8d2d0c819a4c116b8d2d0c8
+	.octa 0x1e376c085141ab531e376c085141ab53,\
+		0x1e376c085141ab531e376c085141ab53
+	.octa 0x2748774cdf8eeb992748774cdf8eeb99,\
+		0x2748774cdf8eeb992748774cdf8eeb99
+	.octa 0x34b0bcb5e19b48a834b0bcb5e19b48a8,\
+		0x34b0bcb5e19b48a834b0bcb5e19b48a8
+	.octa 0x391c0cb3c5c95a63391c0cb3c5c95a63,\
+		0x391c0cb3c5c95a63391c0cb3c5c95a63
+	.octa 0x4ed8aa4ae3418acb4ed8aa4ae3418acb,\
+		0x4ed8aa4ae3418acb4ed8aa4ae3418acb
+	.octa 0x5b9cca4f7763e3735b9cca4f7763e373,\
+		0x5b9cca4f7763e3735b9cca4f7763e373
+	.octa 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3,\
+		0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3
+	.octa 0x748f82ee5defb2fc748f82ee5defb2fc,\
+		0x748f82ee5defb2fc748f82ee5defb2fc
+	.octa 0x78a5636f43172f6078a5636f43172f60,\
+		0x78a5636f43172f6078a5636f43172f60
+	.octa 0x84c87814a1f0ab7284c87814a1f0ab72,\
+		0x84c87814a1f0ab7284c87814a1f0ab72
+	.octa 0x8cc702081a6439ec8cc702081a6439ec,\
+		0x8cc702081a6439ec8cc702081a6439ec
+	.octa 0x90befffa23631e2890befffa23631e28,\
+		0x90befffa23631e2890befffa23631e28
+	.octa 0xa4506cebde82bde9a4506cebde82bde9,\
+		0xa4506cebde82bde9a4506cebde82bde9
+	.octa 0xbef9a3f7b2c67915bef9a3f7b2c67915,\
+		0xbef9a3f7b2c67915bef9a3f7b2c67915
+	.octa 0xc67178f2e372532bc67178f2e372532b,\
+		0xc67178f2e372532bc67178f2e372532b
+	.octa 0xca273eceea26619cca273eceea26619c,\
+		0xca273eceea26619cca273eceea26619c
+	.octa 0xd186b8c721c0c207d186b8c721c0c207,\
+		0xd186b8c721c0c207d186b8c721c0c207
+	.octa 0xeada7dd6cde0eb1eeada7dd6cde0eb1e,\
+		0xeada7dd6cde0eb1eeada7dd6cde0eb1e
+	.octa 0xf57d4f7fee6ed178f57d4f7fee6ed178,\
+		0xf57d4f7fee6ed178f57d4f7fee6ed178
+	.octa 0x06f067aa72176fba06f067aa72176fba,\
+		0x06f067aa72176fba06f067aa72176fba
+	.octa 0x0a637dc5a2c898a60a637dc5a2c898a6,\
+		0x0a637dc5a2c898a60a637dc5a2c898a6
+	.octa 0x113f9804bef90dae113f9804bef90dae,\
+		0x113f9804bef90dae113f9804bef90dae
+	.octa 0x1b710b35131c471b1b710b35131c471b,\
+		0x1b710b35131c471b1b710b35131c471b
+	.octa 0x28db77f523047d8428db77f523047d84,\
+		0x28db77f523047d8428db77f523047d84
+	.octa 0x32caab7b40c7249332caab7b40c72493,\
+		0x32caab7b40c7249332caab7b40c72493
+	.octa 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc,\
+		0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc
+	.octa 0x431d67c49c100d4c431d67c49c100d4c,\
+		0x431d67c49c100d4c431d67c49c100d4c
+	.octa 0x4cc5d4becb3e42b64cc5d4becb3e42b6,\
+		0x4cc5d4becb3e42b64cc5d4becb3e42b6
+	.octa 0x597f299cfc657e2a597f299cfc657e2a,\
+		0x597f299cfc657e2a597f299cfc657e2a
+	.octa 0x5fcb6fab3ad6faec5fcb6fab3ad6faec,\
+		0x5fcb6fab3ad6faec5fcb6fab3ad6faec
+	.octa 0x6c44198c4a4758176c44198c4a475817,\
+		0x6c44198c4a4758176c44198c4a475817
+
+PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
+                         .octa 0x18191a1b1c1d1e1f1011121314151617
-- 
cgit v1.2.3


From 02fa472afef2f619a16eb24117b90f9e7998c65f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Jun 2016 18:03:59 +0800
Subject: crypto: aesni - Use crypto_cipher to derive rfc4106 subkey

Currently aesni uses an async ctr(aes) to derive the rfc4106
subkey, which was presumably copied over from the generic rfc4106
code.  Over there it's done that way because we already have a
ctr(aes) spawn.  But it is simply overkill for aesni since we
have to go get a ctr(aes) from scratch anyway.

This patch simplifies the subkey derivation by using a straight
aes cipher instead.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 76 ++++++--------------------------------
 1 file changed, 11 insertions(+), 65 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 9e15572ef06d..0ab5ee1c26af 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -59,17 +59,6 @@ struct aesni_rfc4106_gcm_ctx {
 	u8 nonce[4];
 };
 
-struct aesni_gcm_set_hash_subkey_result {
-	int err;
-	struct completion completion;
-};
-
-struct aesni_hash_subkey_req_data {
-	u8 iv[16];
-	struct aesni_gcm_set_hash_subkey_result result;
-	struct scatterlist sg;
-};
-
 struct aesni_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
@@ -809,71 +798,28 @@ static void rfc4106_exit(struct crypto_aead *aead)
 	cryptd_free_aead(*ctx);
 }
 
-static void
-rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
-{
-	struct aesni_gcm_set_hash_subkey_result *result = req->data;
-
-	if (err == -EINPROGRESS)
-		return;
-	result->err = err;
-	complete(&result->completion);
-}
-
 static int
 rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
 {
-	struct crypto_ablkcipher *ctr_tfm;
-	struct ablkcipher_request *req;
-	int ret = -EINVAL;
-	struct aesni_hash_subkey_req_data *req_data;
+	struct crypto_cipher *tfm;
+	int ret;
 
-	ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
-	if (IS_ERR(ctr_tfm))
-		return PTR_ERR(ctr_tfm);
+	tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
 
-	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+	ret = crypto_cipher_setkey(tfm, key, key_len);
 	if (ret)
-		goto out_free_ablkcipher;
-
-	ret = -ENOMEM;
-	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
-	if (!req)
-		goto out_free_ablkcipher;
-
-	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
-	if (!req_data)
-		goto out_free_request;
-
-	memset(req_data->iv, 0, sizeof(req_data->iv));
+		goto out_free_cipher;
 
 	/* Clear the data in the hash sub key container to zero.*/
 	/* We want to cipher all zeros to create the hash sub key. */
 	memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
 
-	init_completion(&req_data->result.completion);
-	sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
-	ablkcipher_request_set_tfm(req, ctr_tfm);
-	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
-					CRYPTO_TFM_REQ_MAY_BACKLOG,
-					rfc4106_set_hash_subkey_done,
-					&req_data->result);
-
-	ablkcipher_request_set_crypt(req, &req_data->sg,
-		&req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
-
-	ret = crypto_ablkcipher_encrypt(req);
-	if (ret == -EINPROGRESS || ret == -EBUSY) {
-		ret = wait_for_completion_interruptible
-			(&req_data->result.completion);
-		if (!ret)
-			ret = req_data->result.err;
-	}
-	kfree(req_data);
-out_free_request:
-	ablkcipher_request_free(req);
-out_free_ablkcipher:
-	crypto_free_ablkcipher(ctr_tfm);
+	crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey);
+
+out_free_cipher:
+	crypto_free_cipher(tfm);
 	return ret;
 }
 
-- 
cgit v1.2.3


From eb9bc8e7afaa9f062105dad55ec1c0663d961bb3 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Fri, 8 Jul 2016 09:28:03 -0700
Subject: crypto: sha-mb - Cleanup code to use || instead of |

 for condition comparison and cleanup multiline comment style

In sha*_ctx_mgr_submit, we currently use the | operator instead of ||
((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE))

Switching it to || and remove extraneous paranthesis to
adhere to coding style.

Also cleanup inconsistent multiline comment style.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha1-mb/sha1_mb.c     |  2 +-
 arch/x86/crypto/sha256-mb/sha256_mb.c | 11 +++++++----
 arch/x86/crypto/sha512-mb/sha512_mb.c | 11 +++++++----
 3 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c
index 561b2867b878..9e5b67127a09 100644
--- a/arch/x86/crypto/sha1-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha1-mb/sha1_mb.c
@@ -304,7 +304,7 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 	 * Or if the user's buffer contains less than a whole block,
 	 * append as much as possible to the extra block.
 	 */
-	if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+	if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) {
 		/*
 		 * Compute how many bytes to copy from user buffer into
 		 * extra block
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
index c9d5dcc81c96..89fa85e8b10c 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb.c
+++ b/arch/x86/crypto/sha256-mb/sha256_mb.c
@@ -283,7 +283,8 @@ static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
 	ctx->incoming_buffer = buffer;
 	ctx->incoming_buffer_length = len;
 
-	/* Store the user's request flags and mark this ctx as currently
+	/*
+	 * Store the user's request flags and mark this ctx as currently
 	 * being processed.
 	 */
 	ctx->status = (flags & HASH_LAST) ?
@@ -299,8 +300,9 @@ static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
 	 * Or if the user's buffer contains less than a whole block,
 	 * append as much as possible to the extra block.
 	 */
-	if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
-		/* Compute how many bytes to copy from user buffer into
+	if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) {
+		/*
+		 * Compute how many bytes to copy from user buffer into
 		 * extra block
 		 */
 		uint32_t copy_len = SHA256_BLOCK_SIZE -
@@ -323,7 +325,8 @@ static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
 		/* The extra block should never contain more than 1 block */
 		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
 
-		/* If the extra block buffer contains exactly 1 block,
+		/*
+		 * If the extra block buffer contains exactly 1 block,
 		 * it can be hashed.
 		 */
 		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
index 676f0f272f2b..f4cf5b78fd36 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb.c
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -253,7 +253,8 @@ static struct sha512_hash_ctx
 					  int flags)
 {
 	if (flags & (~HASH_ENTIRE)) {
-		/* User should not pass anything other than FIRST, UPDATE, or
+		/*
+		 * User should not pass anything other than FIRST, UPDATE, or
 		 * LAST
 		 */
 		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
@@ -284,7 +285,8 @@ static struct sha512_hash_ctx
 		ctx->partial_block_buffer_length = 0;
 	}
 
-	/* If we made it here, there were no errors during this call to
+	/*
+	 * If we made it here, there were no errors during this call to
 	 * submit
 	 */
 	ctx->error = HASH_CTX_ERROR_NONE;
@@ -293,7 +295,8 @@ static struct sha512_hash_ctx
 	ctx->incoming_buffer = buffer;
 	ctx->incoming_buffer_length = len;
 
-	/* Store the user's request flags and mark this ctx as currently being
+	/*
+	 * Store the user's request flags and mark this ctx as currently being
 	 * processed.
 	 */
 	ctx->status = (flags & HASH_LAST) ?
@@ -309,7 +312,7 @@ static struct sha512_hash_ctx
 	 * Or if the user's buffer contains less than a whole block,
 	 * append as much as possible to the extra block.
 	 */
-	if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+	if (ctx->partial_block_buffer_length || len < SHA512_BLOCK_SIZE) {
 		/* Compute how many bytes to copy from user buffer into extra
 		 * block
 		 */
-- 
cgit v1.2.3