// SPDX-License-Identifier: GPL-2.0
/*
 * ChaCha and HChaCha functions (ARM optimized)
 *
 * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2015 Martin Willi
 */

#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/cputype.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
				      u8 *dst, const u8 *src, int nrounds);
asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       int nrounds, unsigned int nbytes);
asmlinkage void hchacha_block_arm(const struct chacha_state *state,
				  u32 out[HCHACHA_OUT_WORDS], int nrounds);
asmlinkage void hchacha_block_neon(const struct chacha_state *state,
				   u32 out[HCHACHA_OUT_WORDS], int nrounds);

asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
			     const struct chacha_state *state, int nrounds);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);

static inline bool neon_usable(void)
{
	return static_branch_likely(&use_neon) && crypto_simd_usable();
}

static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	u8 buf[CHACHA_BLOCK_SIZE];

	while (bytes > CHACHA_BLOCK_SIZE) {
		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);

		chacha_4block_xor_neon(state, dst, src, nrounds, l);
		bytes -= l;
		src += l;
		dst += l;
		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
	}
	if (bytes) {
		const u8 *s = src;
		u8 *d = dst;

		/* Bounce a partial final block through a stack buffer. */
		if (bytes != CHACHA_BLOCK_SIZE)
			s = d = memcpy(buf, src, bytes);
		chacha_block_xor_neon(state, d, s, nrounds);
		if (d != dst)
			memcpy(dst, buf, bytes);
		state->x[12]++;
	}
}

void hchacha_block_arch(const struct chacha_state *state,
			u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
		hchacha_block_arm(state, out, nrounds);
	} else {
		kernel_neon_begin();
		hchacha_block_neon(state, out, nrounds);
		kernel_neon_end();
	}
}
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
		       unsigned int bytes, int nrounds)
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
	    bytes <= CHACHA_BLOCK_SIZE) {
		chacha_doarm(dst, src, bytes, state, nrounds);
		state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
		return;
	}

	do {
		/*
		 * Limit each NEON section to 4 KiB so that preemption is
		 * not disabled for too long at a time.
		 */
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();
		chacha_doneon(state, dst, src, todo, nrounds);
		kernel_neon_end();

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);

bool chacha_is_arch_optimized(void)
{
	/* We can always use at least the ARM scalar implementation. */
	return true;
}
EXPORT_SYMBOL(chacha_is_arch_optimized);

static int __init chacha_arm_mod_init(void)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
		switch (read_cpuid_part()) {
		case ARM_CPU_PART_CORTEX_A7:
		case ARM_CPU_PART_CORTEX_A5:
			/*
			 * The Cortex-A7 and Cortex-A5 do not perform well
			 * with the NEON implementation but do incredibly
			 * well with the scalar one and use less power.
			 */
			break;
		default:
			static_branch_enable(&use_neon);
		}
	}
	return 0;
}
subsys_initcall(chacha_arm_mod_init);

static void __exit chacha_arm_mod_exit(void)
{
}
module_exit(chacha_arm_mod_exit);

MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
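
/*
 * Illustrative sketch, not part of the file above: one way a caller could
 * drive the exported chacha_crypt_arch() for a one-shot ChaCha20 encryption.
 * It assumes the chacha_init() helper, CHACHA_KEY_WORDS, and CHACHA_IV_SIZE
 * from <crypto/chacha.h>, plus memzero_explicit() from <linux/string.h>;
 * chacha20_encrypt_example() itself is a hypothetical name, not an upstream
 * function.
 */
static void __maybe_unused
chacha20_encrypt_example(u8 *dst, const u8 *src, unsigned int len,
			 const u32 key[CHACHA_KEY_WORDS],
			 const u8 iv[CHACHA_IV_SIZE])
{
	struct chacha_state state;

	/* Load the constants, the key, and the 16-byte IV (counter + nonce). */
	chacha_init(&state, key, iv);
	/* 20 rounds selects ChaCha20; 12 would select ChaCha12. */
	chacha_crypt_arch(&state, dst, src, len, 20);
	/* Wipe the expanded key material from the stack. */
	memzero_explicit(&state, sizeof(state));
}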