diff options
Diffstat (limited to 'arch')
32 files changed, 392 insertions, 161 deletions
diff --git a/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi b/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi index f3404dd10537..cf628465cd0a 100644 --- a/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi +++ b/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi @@ -230,6 +230,8 @@ accelerometer@1c { compatible = "fsl,mma8451"; reg = <0x1c>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_mma8451_int>; interrupt-parent = <&gpio6>; interrupts = <31 IRQ_TYPE_LEVEL_LOW>; }; @@ -628,6 +630,12 @@ >; }; + pinctrl_mma8451_int: mma8451intgrp { + fsl,pins = < + MX6QDL_PAD_EIM_BCLK__GPIO6_IO31 0xb0b1 + >; + }; + pinctrl_pwm3: pwm1grp { fsl,pins = < MX6QDL_PAD_SD4_DAT1__PWM3_OUT 0x1b0b1 diff --git a/arch/arm/boot/dts/stm32mp157c.dtsi b/arch/arm/boot/dts/stm32mp157c.dtsi index 0c4e6ebc3529..31556bea2c93 100644 --- a/arch/arm/boot/dts/stm32mp157c.dtsi +++ b/arch/arm/boot/dts/stm32mp157c.dtsi @@ -914,7 +914,7 @@ interrupt-names = "int0", "int1"; clocks = <&rcc CK_HSE>, <&rcc FDCAN_K>; clock-names = "hclk", "cclk"; - bosch,mram-cfg = <0x1400 0 0 32 0 0 2 2>; + bosch,mram-cfg = <0x0 0 0 32 0 0 2 2>; status = "disabled"; }; @@ -927,7 +927,7 @@ interrupt-names = "int0", "int1"; clocks = <&rcc CK_HSE>, <&rcc FDCAN_K>; clock-names = "hclk", "cclk"; - bosch,mram-cfg = <0x0 0 0 32 0 0 2 2>; + bosch,mram-cfg = <0x1400 0 0 32 0 0 2 2>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts index 568b90ece342..3bec3e0a81b2 100644 --- a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts +++ b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts @@ -192,6 +192,7 @@ vqmmc-supply = <®_dldo1>; non-removable; wakeup-source; + keep-power-in-suspend; status = "okay"; brcmf: wifi@1 { diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index d5e0b908f0ba..25da9b2d9610 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1197,6 +1197,9 @@ void __init adjust_lowmem_bounds(void) phys_addr_t block_start = reg->base; phys_addr_t block_end = reg->base + reg->size; + if (memblock_is_nomap(reg)) + continue; + if (reg->base < vmalloc_limit) { if (block_end > lowmem_limit) /* diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts index de6ef39f3118..fce9343dc017 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts @@ -99,7 +99,7 @@ status = "okay"; i2c-mux@77 { - compatible = "nxp,pca9847"; + compatible = "nxp,pca9547"; reg = <0x77>; #address-cells = <1>; #size-cells = <0>; diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi index 0d0a6543e5db..a9824b862c41 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi @@ -370,7 +370,7 @@ }; sdma2: dma-controller@302c0000 { - compatible = "fsl,imx8mm-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mm-sdma", "fsl,imx8mq-sdma"; reg = <0x302c0000 0x10000>; interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clk IMX8MM_CLK_SDMA2_ROOT>, @@ -381,7 +381,7 @@ }; sdma3: dma-controller@302b0000 { - compatible = "fsl,imx8mm-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mm-sdma", "fsl,imx8mq-sdma"; reg = <0x302b0000 0x10000>; interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clk IMX8MM_CLK_SDMA3_ROOT>, @@ -693,7 +693,7 @@ }; sdma1: dma-controller@30bd0000 { - compatible = "fsl,imx8mm-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mm-sdma", "fsl,imx8mq-sdma"; reg = <0x30bd0000 0x10000>; interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clk IMX8MM_CLK_SDMA1_ROOT>, diff --git a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi index 3faa652fdf20..c25be32ba37e 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi @@ -100,7 +100,7 @@ regulator-name = "0V9_ARM"; regulator-min-microvolt = <900000>; regulator-max-microvolt = <1000000>; - gpios = <&gpio3 19 GPIO_ACTIVE_HIGH>; + gpios = <&gpio3 16 GPIO_ACTIVE_HIGH>; states = <1000000 0x1 900000 0x0>; regulator-always-on; diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 10415572e82f..322b55664cca 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -48,5 +48,6 @@ EXPORT_SYMBOL(__arch_clear_user) .section .fixup,"ax" .align 2 9: mov x0, x2 // return the original size + uaccess_disable_not_uao x2, x3 ret .previous diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 680e74409ff9..8472dc7798b3 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -66,5 +66,6 @@ EXPORT_SYMBOL(__arch_copy_from_user) .section .fixup,"ax" .align 2 9998: sub x0, end, dst // bytes not copied + uaccess_disable_not_uao x3, x4 ret .previous diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 0bedae3f3792..8e0355c1e318 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -68,5 +68,6 @@ EXPORT_SYMBOL(__arch_copy_in_user) .section .fixup,"ax" .align 2 9998: sub x0, end, dst // bytes not copied + uaccess_disable_not_uao x3, x4 ret .previous diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 2d88c736e8f2..6085214654dc 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -65,5 +65,6 @@ EXPORT_SYMBOL(__arch_copy_to_user) .section .fixup,"ax" .align 2 9998: sub x0, end, dst // bytes not copied + uaccess_disable_not_uao x3, x4 ret .previous diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index ec1c97a8e8cb..baaafc9b9d88 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -140,9 +140,12 @@ void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr); /* Patch sites */ extern s32 patch__call_flush_count_cache; extern s32 patch__flush_count_cache_return; +extern s32 patch__flush_link_stack_return; +extern s32 patch__call_kvm_flush_link_stack; extern s32 patch__memset_nocache, patch__memcpy_nocache; extern long flush_count_cache; +extern long kvm_flush_link_stack; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); diff --git a/arch/powerpc/include/asm/security_features.h b/arch/powerpc/include/asm/security_features.h index 759597bf0fd8..ccf44c135389 100644 --- a/arch/powerpc/include/asm/security_features.h +++ b/arch/powerpc/include/asm/security_features.h @@ -81,6 +81,9 @@ static inline bool security_ftr_enabled(unsigned long feature) // Software required to flush count cache on context switch #define SEC_FTR_FLUSH_COUNT_CACHE 0x0000000000000400ull +// Software required to flush link stack on context switch +#define SEC_FTR_FLUSH_LINK_STACK 0x0000000000001000ull + // Features enabled by default #define SEC_FTR_DEFAULT \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 0a0b5310f54a..81d61770f9c2 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -546,6 +546,7 @@ flush_count_cache: /* Save LR into r9 */ mflr r9 + // Flush the link stack .rept 64 bl .+4 .endr @@ -555,6 +556,11 @@ flush_count_cache: .balign 32 /* Restore LR */ 1: mtlr r9 + + // If we're just flushing the link stack, return here +3: nop + patch_site 3b patch__flush_link_stack_return + li r9,0x7fff mtctr r9 diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index e1c9cf079503..bd91dceb7010 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -24,11 +24,12 @@ enum count_cache_flush_type { COUNT_CACHE_FLUSH_HW = 0x4, }; static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; +static bool link_stack_flush_enabled; bool barrier_nospec_enabled; static bool no_nospec; static bool btb_flush_enabled; -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static bool no_spectrev2; #endif @@ -114,7 +115,7 @@ static __init int security_feature_debugfs_init(void) device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static int __init handle_nospectre_v2(char *p) { no_spectrev2 = true; @@ -122,6 +123,9 @@ static int __init handle_nospectre_v2(char *p) return 0; } early_param("nospectre_v2", handle_nospectre_v2); +#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E void setup_spectre_v2(void) { if (no_spectrev2 || cpu_mitigations_off()) @@ -209,11 +213,19 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c if (ccd) seq_buf_printf(&s, "Indirect branch cache disabled"); + + if (link_stack_flush_enabled) + seq_buf_printf(&s, ", Software link stack flush"); + } else if (count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) { seq_buf_printf(&s, "Mitigation: Software count cache flush"); if (count_cache_flush_type == COUNT_CACHE_FLUSH_HW) seq_buf_printf(&s, " (hardware accelerated)"); + + if (link_stack_flush_enabled) + seq_buf_printf(&s, ", Software link stack flush"); + } else if (btb_flush_enabled) { seq_buf_printf(&s, "Mitigation: Branch predictor state flush"); } else { @@ -374,18 +386,49 @@ static __init int stf_barrier_debugfs_init(void) device_initcall(stf_barrier_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +static void no_count_cache_flush(void) +{ + count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; + pr_info("count-cache-flush: software flush disabled.\n"); +} + static void toggle_count_cache_flush(bool enable) { - if (!enable || !security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { + if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE) && + !security_ftr_enabled(SEC_FTR_FLUSH_LINK_STACK)) + enable = false; + + if (!enable) { patch_instruction_site(&patch__call_flush_count_cache, PPC_INST_NOP); - count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; - pr_info("count-cache-flush: software flush disabled.\n"); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + patch_instruction_site(&patch__call_kvm_flush_link_stack, PPC_INST_NOP); +#endif + pr_info("link-stack-flush: software flush disabled.\n"); + link_stack_flush_enabled = false; + no_count_cache_flush(); return; } + // This enables the branch from _switch to flush_count_cache patch_branch_site(&patch__call_flush_count_cache, (u64)&flush_count_cache, BRANCH_SET_LINK); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + // This enables the branch from guest_exit_cont to kvm_flush_link_stack + patch_branch_site(&patch__call_kvm_flush_link_stack, + (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); +#endif + + pr_info("link-stack-flush: software flush enabled.\n"); + link_stack_flush_enabled = true; + + // If we just need to flush the link stack, patch an early return + if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { + patch_instruction_site(&patch__flush_link_stack_return, PPC_INST_BLR); + no_count_cache_flush(); + return; + } + if (!security_ftr_enabled(SEC_FTR_BCCTR_FLUSH_ASSIST)) { count_cache_flush_type = COUNT_CACHE_FLUSH_SW; pr_info("count-cache-flush: full software flush sequence enabled.\n"); @@ -399,7 +442,26 @@ static void toggle_count_cache_flush(bool enable) void setup_count_cache_flush(void) { - toggle_count_cache_flush(true); + bool enable = true; + + if (no_spectrev2 || cpu_mitigations_off()) { + if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) || + security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED)) + pr_warn("Spectre v2 mitigations not fully under software control, can't disable\n"); + + enable = false; + } + + /* + * There's no firmware feature flag/hypervisor bit to tell us we need to + * flush the link stack on context switch. So we set it here if we see + * either of the Spectre v2 mitigations that aim to protect userspace. + */ + if (security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED) || + security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) + security_ftr_set(SEC_FTR_FLUSH_LINK_STACK); + + toggle_count_cache_flush(enable); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 07181d0dfcb7..0ba1d7abb798 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -11,6 +11,7 @@ */ #include <asm/ppc_asm.h> +#include <asm/code-patching-asm.h> #include <asm/kvm_asm.h> #include <asm/reg.h> #include <asm/mmu.h> @@ -1458,6 +1459,13 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 1: #endif /* CONFIG_KVM_XICS */ + /* + * Possibly flush the link stack here, before we do a blr in + * guest_exit_short_path. + */ +1: nop + patch_site 1b patch__call_kvm_flush_link_stack + /* If we came in through the P9 short path, go back out to C now */ lwz r0, STACK_SLOT_SHORT_PATH(r1) cmpwi r0, 0 @@ -1933,6 +1941,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtlr r0 blr +.balign 32 +.global kvm_flush_link_stack +kvm_flush_link_stack: + /* Save LR into r0 */ + mflr r0 + + /* Flush the link stack. On Power8 it's up to 32 entries in size. */ + .rept 32 + bl .+4 + .endr + + /* And on Power9 it's up to 64. */ +BEGIN_FTR_SECTION + .rept 32 + bl .+4 + .endr +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + + /* Restore LR */ + mtlr r0 + blr + kvmppc_guest_external: /* External interrupt, first check for host_ipi. If this is * set, we know the host wants us out so let's do it now diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 02a59946a78a..be3517ef0574 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -1142,6 +1142,19 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) } /* + * If we have seen a tail call, we need a second pass. + * This is because bpf_jit_emit_common_epilogue() is called + * from bpf_jit_emit_tail_call() with a not yet stable ctx->seen. + */ + if (cgctx.seen & SEEN_TAILCALL) { + cgctx.idx = 0; + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { + fp = org_fp; + goto out_addrs; + } + } + + /* * Pretend to build prologue, given the features we've seen. This will * update ctgtx.idx as it pretends to output instructions, then we can * calculate total size from idx. diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4f86928246e7..1153e510cedd 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -172,7 +172,7 @@ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI .if \no_user_check == 0 /* coming from usermode? */ - testl $SEGMENT_RPL_MASK, PT_CS(%esp) + testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp) jz .Lend_\@ .endif /* On user-cr3? */ @@ -205,64 +205,76 @@ #define CS_FROM_ENTRY_STACK (1 << 31) #define CS_FROM_USER_CR3 (1 << 30) #define CS_FROM_KERNEL (1 << 29) +#define CS_FROM_ESPFIX (1 << 28) .macro FIXUP_FRAME /* * The high bits of the CS dword (__csh) are used for CS_FROM_*. * Clear them in case hardware didn't do this for us. */ - andl $0x0000ffff, 3*4(%esp) + andl $0x0000ffff, 4*4(%esp) #ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, 4*4(%esp) + testl $X86_EFLAGS_VM, 5*4(%esp) jnz .Lfrom_usermode_no_fixup_\@ #endif - testl $SEGMENT_RPL_MASK, 3*4(%esp) + testl $USER_SEGMENT_RPL_MASK, 4*4(%esp) jnz .Lfrom_usermode_no_fixup_\@ - orl $CS_FROM_KERNEL, 3*4(%esp) + orl $CS_FROM_KERNEL, 4*4(%esp) /* * When we're here from kernel mode; the (exception) stack looks like: * - * 5*4(%esp) - <previous context> - * 4*4(%esp) - flags - * 3*4(%esp) - cs - * 2*4(%esp) - ip - * 1*4(%esp) - orig_eax - * 0*4(%esp) - gs / function + * 6*4(%esp) - <previous context> + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs * * Lets build a 5 entry IRET frame after that, such that struct pt_regs * is complete and in particular regs->sp is correct. This gives us - * the original 5 enties as gap: + * the original 6 enties as gap: * - * 12*4(%esp) - <previous context> - * 11*4(%esp) - gap / flags - * 10*4(%esp) - gap / cs - * 9*4(%esp) - gap / ip - * 8*4(%esp) - gap / orig_eax - * 7*4(%esp) - gap / gs / function - * 6*4(%esp) - ss - * 5*4(%esp) - sp - * 4*4(%esp) - flags - * 3*4(%esp) - cs - * 2*4(%esp) - ip - * 1*4(%esp) - orig_eax - * 0*4(%esp) - gs / function + * 14*4(%esp) - <previous context> + * 13*4(%esp) - gap / flags + * 12*4(%esp) - gap / cs + * 11*4(%esp) - gap / ip + * 10*4(%esp) - gap / orig_eax + * 9*4(%esp) - gap / gs / function + * 8*4(%esp) - gap / fs + * 7*4(%esp) - ss + * 6*4(%esp) - sp + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs */ pushl %ss # ss pushl %esp # sp (points at ss) - addl $6*4, (%esp) # point sp back at the previous context - pushl 6*4(%esp) # flags - pushl 6*4(%esp) # cs - pushl 6*4(%esp) # ip - pushl 6*4(%esp) # orig_eax - pushl 6*4(%esp) # gs / function + addl $7*4, (%esp) # point sp back at the previous context + pushl 7*4(%esp) # flags + pushl 7*4(%esp) # cs + pushl 7*4(%esp) # ip + pushl 7*4(%esp) # orig_eax + pushl 7*4(%esp) # gs / function + pushl 7*4(%esp) # fs .Lfrom_usermode_no_fixup_\@: .endm .macro IRET_FRAME + /* + * We're called with %ds, %es, %fs, and %gs from the interrupted + * frame, so we shouldn't use them. Also, we may be in ESPFIX + * mode and therefore have a nonzero SS base and an offset ESP, + * so any attempt to access the stack needs to use SS. (except for + * accesses through %esp, which automatically use SS.) + */ testl $CS_FROM_KERNEL, 1*4(%esp) jz .Lfinished_frame_\@ @@ -276,31 +288,40 @@ movl 5*4(%esp), %eax # (modified) regs->sp movl 4*4(%esp), %ecx # flags - movl %ecx, -4(%eax) + movl %ecx, %ss:-1*4(%eax) movl 3*4(%esp), %ecx # cs andl $0x0000ffff, %ecx - movl %ecx, -8(%eax) + movl %ecx, %ss:-2*4(%eax) movl 2*4(%esp), %ecx # ip - movl %ecx, -12(%eax) + movl %ecx, %ss:-3*4(%eax) movl 1*4(%esp), %ecx # eax - movl %ecx, -16(%eax) + movl %ecx, %ss:-4*4(%eax) popl %ecx - lea -16(%eax), %esp + lea -4*4(%eax), %esp popl %eax .Lfinished_frame_\@: .endm -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0 cld .if \skip_gs == 0 PUSH_GS .endif - FIXUP_FRAME pushl %fs + + pushl %eax + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs +.if \unwind_espfix > 0 + UNWIND_ESPFIX_STACK +.endif + popl %eax + + FIXUP_FRAME pushl %es pushl %ds pushl \pt_regs_ax @@ -313,8 +334,6 @@ movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs .if \skip_gs == 0 SET_KERNEL_GS %edx .endif @@ -324,8 +343,8 @@ .endif .endm -.macro SAVE_ALL_NMI cr3_reg:req - SAVE_ALL +.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0 + SAVE_ALL unwind_espfix=\unwind_espfix BUG_IF_WRONG_CR3 @@ -357,6 +376,7 @@ 2: popl %es 3: popl %fs POP_GS \pop + IRET_FRAME .pushsection .fixup, "ax" 4: movl $0, (%esp) jmp 1b @@ -395,7 +415,8 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -1075,7 +1096,6 @@ restore_all: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: - IRET_FRAME /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization * when returning from IPI handler and when returning from @@ -1128,30 +1148,43 @@ ENDPROC(entry_INT80_32) * We can't call C functions using the ESPFIX stack. This code reads * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. + * + * We might be on user CR3 here, so percpu data is not mapped and we can't + * access the GDT through the percpu segment. Instead, use SGDT to find + * the cpu_entry_area alias of the GDT. */ #ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + pushl %ecx + subl $2*4, %esp + sgdt (%esp) + movl 2(%esp), %ecx /* GDT address */ + /* + * Careful: ECX is a linear pointer, so we need to force base + * zero. %cs is the only known-linear segment we have right now. + */ + mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */ + mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */ shl $16, %eax + addl $2*4, %esp + popl %ecx addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS pushl %eax lss (%esp), %esp /* switch to the normal stack segment */ #endif .endm + .macro UNWIND_ESPFIX_STACK + /* It's safe to clobber %eax, all other regs need to be preserved */ #ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es + jne .Lno_fixup_\@ /* switch to normal stack */ FIXUP_ESPFIX_STACK -27: +.Lno_fixup_\@: #endif .endm @@ -1341,11 +1374,6 @@ END(spurious_interrupt_bug) #ifdef CONFIG_XEN_PV ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - /* * Check to see if we got the event in the critical * region in xen_iret_direct, after we've reenabled @@ -1353,16 +1381,17 @@ ENTRY(xen_hypervisor_callback) * iret instruction's behaviour where it delivers a * pending interrupt when enabling interrupts: */ - movl PT_EIP(%esp), %eax - cmpl $xen_iret_start_crit, %eax + cmpl $xen_iret_start_crit, (%esp) jb 1f - cmpl $xen_iret_end_crit, %eax + cmpl $xen_iret_end_crit, (%esp) jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax + call xen_iret_crit_fixup +1: + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + mov %esp, %eax call xen_evtchn_do_upcall #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall @@ -1449,10 +1478,9 @@ END(page_fault) common_exception_read_cr2: /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER - UNWIND_ESPFIX_STACK /* fixup %gs */ GS_TO_REG %ecx @@ -1474,9 +1502,8 @@ END(common_exception_read_cr2) common_exception: /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER - UNWIND_ESPFIX_STACK /* fixup %gs */ GS_TO_REG %ecx @@ -1515,6 +1542,10 @@ ENTRY(nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 + /* + * ESPFIX_SS is only ever set on the return to user path + * after we've switched to the entry stack. + */ pushl %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax @@ -1550,6 +1581,11 @@ ENTRY(nmi) movl %ebx, %esp .Lnmi_return: +#ifdef CONFIG_X86_ESPFIX32 + testl $CS_FROM_ESPFIX, PT_CS(%esp) + jnz .Lnmi_from_espfix +#endif + CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 jmp .Lirq_return @@ -1557,23 +1593,42 @@ ENTRY(nmi) #ifdef CONFIG_X86_ESPFIX32 .Lnmi_espfix_stack: /* - * create the pointer to lss back + * Create the pointer to LSS back */ pushl %ss pushl %esp addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL_NMI cr3_reg=%edi + + /* Copy the (short) IRET frame */ + pushl 4*4(%esp) # flags + pushl 4*4(%esp) # cs + pushl 4*4(%esp) # ip + + pushl %eax # orig_ax + + SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1 ENCODE_FRAME_POINTER - FIXUP_ESPFIX_STACK # %eax == %esp + + /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */ + xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp) + xorl %edx, %edx # zero error code - call do_nmi + movl %esp, %eax # pt_regs pointer + jmp .Lnmi_from_sysenter_stack + +.Lnmi_from_espfix: RESTORE_ALL_NMI cr3_reg=%edi - lss 12+4(%esp), %esp # back to espfix stack + /* + * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to + * fix up the gap and long frame: + * + * 3 - original frame (exception) + * 2 - ESPFIX block (above) + * 6 - gap (FIXUP_FRAME) + * 5 - long frame (FIXUP_FRAME) + * 1 - orig_ax + */ + lss (1+5+6)*4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif END(nmi) diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index cff3f3f3bfe0..6e9c9af3255a 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -78,8 +78,12 @@ struct cpu_entry_area { /* * The GDT is just below entry_stack and thus serves (on x86_64) as - * a a read-only guard page. + * a read-only guard page. On 32-bit the GDT must be writeable, so + * it needs an extra guard page. */ +#ifdef CONFIG_X86_32 + char guard_entry_stack[PAGE_SIZE]; +#endif struct entry_stack_page entry_stack_page; /* @@ -94,7 +98,6 @@ struct cpu_entry_area { */ struct cea_exception_stacks estacks; #endif -#ifdef CONFIG_CPU_SUP_INTEL /* * Per CPU debug store for Intel performance monitoring. Wastes a * full page at the moment. @@ -105,11 +108,13 @@ struct cpu_entry_area { * Reserve enough fixmap PTEs. */ struct debug_store_buffers cpu_debug_buffers; -#endif }; -#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) +#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) + +/* Total size includes the readonly IDT mapping page as well: */ +#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); @@ -117,13 +122,14 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); +/* Single page reserved for the readonly IDT mapping: */ #define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE #define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) #define CPU_ENTRY_AREA_MAP_SIZE \ - (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE) extern struct cpu_entry_area *get_cpu_entry_area(int cpu); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4c95c365058a..44c48e34d799 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -509,7 +509,7 @@ static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } /* diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index b0bc0fff5f1f..1636eb8e5a5b 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -44,11 +44,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c * to avoid include recursion hell */ -#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 39) -#define CPU_ENTRY_AREA_BASE \ - ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ - & PMD_MASK) +/* The +1 is for the readonly IDT page: */ +#define CPU_ENTRY_AREA_BASE \ + ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK) #define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index ac3892920419..6669164abadc 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -31,6 +31,18 @@ */ #define SEGMENT_RPL_MASK 0x3 +/* + * When running on Xen PV, the actual privilege level of the kernel is 1, + * not 0. Testing the Requested Privilege Level in a segment selector to + * determine whether the context is user mode or kernel mode with + * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level + * matches the 0x3 mask. + * + * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV + * kernels because privilege level 2 is never used. + */ +#define USER_SEGMENT_RPL_MASK 0x2 + /* User mode is privilege level 3: */ #define USER_RPL 0x3 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9b7586204cd2..cc5b535d2448 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,7 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ @@ -108,6 +109,12 @@ void __init check_bugs(void) mds_select_mitigation(); taa_select_mitigation(); + /* + * As MDS and TAA mitigations are inter-related, print MDS + * mitigation until after TAA mitigation selection is done. + */ + mds_print_mitigation(); + arch_smt_update(); #ifdef CONFIG_X86_32 @@ -245,6 +252,12 @@ static void __init mds_select_mitigation(void) (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } +} + +static void __init mds_print_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -304,8 +317,12 @@ static void __init taa_select_mitigation(void) return; } - /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ - if (taa_mitigation == TAA_MITIGATION_OFF) + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) goto out; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) @@ -339,6 +356,15 @@ static void __init taa_select_mitigation(void) if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); + /* + * Update MDS mitigation, if necessary, as the mds_user_clear is + * now enabled for TAA mitigation. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } out: pr_info("%s\n", taa_strings[taa_mitigation]); } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index efbd54cc4e69..055c8613b531 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } md.priv = of->kn->priv; resid = md.u.rid; diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0b8cedb20d6d..d5c9b13bafdf 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .ss = __KERNEL_DS, .ds = __USER_DS, .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif .__cr3 = __pa_nodebug(swapper_pg_dir), }; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6c4f01540833..43abebc2fc77 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -709,6 +709,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3ec4, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30f9cb2c0b55..2e6a0676c1f4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -571,6 +571,16 @@ ENTRY(initial_page_table) # error "Kernel PMDs should be 1, 2 or 3" # endif .align PAGE_SIZE /* needs to be page-sized too */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * PTI needs another page so sync_initial_pagetable() works correctly + * and does not scribble over the data which is placed behind the + * actual initial_page_table. See clone_pgd_range(). + */ + .fill 1024, 4, 0 +#endif + #endif .data diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57d87f79558f..04dd3cc6c6ed 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1505,6 +1505,9 @@ void __init tsc_init(void) return; } + if (tsc_clocksource_reliable || no_tsc_watchdog) + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 32b1c6136c6a..2812e5c4ab7b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3352,7 +3352,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -5961,9 +5961,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 752ad11d6868..d9643647a9ce 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -178,7 +178,9 @@ static __init void setup_cpu_entry_area_ptes(void) #ifdef CONFIG_X86_32 unsigned long start, end; - BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + /* The +1 is for the readonly IDT: */ + BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE); + BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE); BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); start = CPU_ENTRY_AREA_BASE; diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index b02a36b2c14f..a42015b305f4 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -69,7 +69,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" + lprefix3_expr = "\\((F2|!F3|66&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -257,7 +257,7 @@ function convert_operands(count,opnd, i,j,imm,mod) return add_flags(imm, mod) } -/^[0-9a-f]+\:/ { +/^[0-9a-f]+:/ { if (NR == 1) next # get index diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index c15db060a242..cd177772fe4d 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -126,10 +126,9 @@ hyper_iret: .globl xen_iret_start_crit, xen_iret_end_crit /* - * This is called by xen_hypervisor_callback in entry.S when it sees + * This is called by xen_hypervisor_callback in entry_32.S when it sees * that the EIP at the time of interrupt was between - * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in - * %eax so we can do a more refined determination of what to do. + * xen_iret_start_crit and xen_iret_end_crit. * * The stack format at this point is: * ---------------- @@ -138,70 +137,46 @@ hyper_iret: * eflags } outer exception info * cs } * eip } - * ---------------- <- edi (copy dest) - * eax : outer eax if it hasn't been restored * ---------------- - * eflags } nested exception info - * cs } (no ss/esp because we're nested - * eip } from the same ring) - * orig_eax }<- esi (copy src) - * - - - - - - - - - * fs } - * es } - * ds } SAVE_ALL state - * eax } - * : : - * ebx }<- esp + * eax : outer eax if it hasn't been restored * ---------------- + * eflags } + * cs } nested exception info + * eip } + * return address : (into xen_hypervisor_callback) * - * In order to deliver the nested exception properly, we need to shift - * everything from the return addr up to the error code so it sits - * just under the outer exception info. This means that when we - * handle the exception, we do it in the context of the outer - * exception rather than starting a new one. + * In order to deliver the nested exception properly, we need to discard the + * nested exception frame such that when we handle the exception, we do it + * in the context of the outer exception rather than starting a new one. * - * The only caveat is that if the outer eax hasn't been restored yet - * (ie, it's still on stack), we need to insert its value into the - * SAVE_ALL state before going on, since it's usermode state which we - * eventually need to restore. + * The only caveat is that if the outer eax hasn't been restored yet (i.e. + * it's still on stack), we need to restore its value here. */ ENTRY(xen_iret_crit_fixup) /* * Paranoia: Make sure we're really coming from kernel space. * One could imagine a case where userspace jumps into the * critical range address, but just before the CPU delivers a - * GP, it decides to deliver an interrupt instead. Unlikely? - * Definitely. Easy to avoid? Yes. The Intel documents - * explicitly say that the reported EIP for a bad jump is the - * jump instruction itself, not the destination, but some - * virtual environments get this wrong. + * PF, it decides to deliver an interrupt instead. Unlikely? + * Definitely. Easy to avoid? Yes. */ - movl PT_CS(%esp), %ecx - andl $SEGMENT_RPL_MASK, %ecx - cmpl $USER_RPL, %ecx - je 2f - - lea PT_ORIG_EAX(%esp), %esi - lea PT_EFLAGS(%esp), %edi + testb $2, 2*4(%esp) /* nested CS */ + jnz 2f /* * If eip is before iret_restore_end then stack * hasn't been restored yet. */ - cmp $iret_restore_end, %eax + cmpl $iret_restore_end, 1*4(%esp) jae 1f - movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */ - movl %eax, PT_EAX(%esp) + movl 4*4(%esp), %eax /* load outer EAX */ + ret $4*4 /* discard nested EIP, CS, and EFLAGS as + * well as the just restored EAX */ - lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */ - - /* set up the copy */ -1: std - mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ - rep movsl - cld - - lea 4(%edi), %esp /* point esp to new frame */ -2: jmp xen_do_upcall +1: + ret $3*4 /* discard nested EIP, CS, and EFLAGS */ +2: + ret +END(xen_iret_crit_fixup) |