summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Lisa Robinson <lisa@bytefly.space> 2026-04-22 10:45:11 +0300
committer Huacai Chen <chenhuacai@loongson.cn> 2026-04-22 10:45:11 +0300
commit e3f4591f7920ce169f2f78fa5a89639ada7d7058 (patch)
tree e359772df8ec1ca32c159ccb91a87aaaddcc1d5a
parent 1829419bc3b291ad9547abe70053c2620832ac41 (diff)
download linux-e3f4591f7920ce169f2f78fa5a89639ada7d7058.tar.xz
LoongArch: Align FPU register state to 32 bytes
Move fpr to the beginning of struct loongarch_fpu so it is naturally aligned to FPU_ALIGN (32 bytes), improving 256-bit SIMD (LASX) context switch performance. Also adjust process.c and fpu.S to work well with the new loongarch_fpu layout.

Signed-off-by: Lisa Robinson <lisa@bytefly.space>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
-rw-r--r-- arch/loongarch/include/asm/processor.h | 2
-rw-r--r-- arch/loongarch/kernel/fpu.S | 12
-rw-r--r-- arch/loongarch/kernel/process.c | 2
3 files changed, 9 insertions, 7 deletions
diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h
index c3bc44b5f5b3..ce8b953f8c79 100644
--- a/arch/loongarch/include/asm/processor.h
+++ b/arch/loongarch/include/asm/processor.h
@@ -80,10 +80,10 @@ BUILD_FPR_ACCESS(32)
BUILD_FPR_ACCESS(64)
struct loongarch_fpu {
+ union fpureg fpr[NUM_FPU_REGS];
uint64_t fcc; /* 8x8 */
uint32_t fcsr;
uint32_t ftop;
- union fpureg fpr[NUM_FPU_REGS];
};
struct loongarch_lbt {
diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index f225dcc5b530..bf7d6b8bf600 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -97,7 +97,7 @@
.endm
#ifdef CONFIG_32BIT
- .macro sc_save_fcc thread tmp0 tmp1
+ .macro sc_save_fcc base tmp0 tmp1
movcf2gr \tmp0, $fcc0
move \tmp1, \tmp0
movcf2gr \tmp0, $fcc1
@@ -106,7 +106,7 @@
bstrins.w \tmp1, \tmp0, 23, 16
movcf2gr \tmp0, $fcc3
bstrins.w \tmp1, \tmp0, 31, 24
- EX st.w \tmp1, \thread, THREAD_FCC
+ EX st.w \tmp1, \base, 0
movcf2gr \tmp0, $fcc4
move \tmp1, \tmp0
movcf2gr \tmp0, $fcc5
@@ -115,11 +115,11 @@
bstrins.w \tmp1, \tmp0, 23, 16
movcf2gr \tmp0, $fcc7
bstrins.w \tmp1, \tmp0, 31, 24
- EX st.w \tmp1, \thread, (THREAD_FCC + 4)
+ EX st.w \tmp1, \base, 4
.endm
- .macro sc_restore_fcc thread tmp0 tmp1
- EX ld.w \tmp0, \thread, THREAD_FCC
+ .macro sc_restore_fcc base tmp0 tmp1
+ EX ld.w \tmp0, \base, 0
bstrpick.w \tmp1, \tmp0, 7, 0
movgr2cf $fcc0, \tmp1
bstrpick.w \tmp1, \tmp0, 15, 8
@@ -128,7 +128,7 @@
movgr2cf $fcc2, \tmp1
bstrpick.w \tmp1, \tmp0, 31, 24
movgr2cf $fcc3, \tmp1
- EX ld.w \tmp0, \thread, (THREAD_FCC + 4)
+ EX ld.w \tmp0, \base, 4
bstrpick.w \tmp1, \tmp0, 7, 0
movgr2cf $fcc4, \tmp1
bstrpick.w \tmp1, \tmp0, 15, 8
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index 4ac1c3086152..17e88eedb154 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -135,6 +135,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
return 0;
}
+ dst->thread.fpu.fcsr = src->thread.fpu.fcsr;
+
if (!used_math())
memcpy(dst, src, offsetof(struct task_struct, thread.fpu.fpr));
else