Diffstat (limited to 'arch')
55 files changed, 1522 insertions, 2436 deletions
diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h new file mode 100644 index 000000000000..7ffa5437b761 --- /dev/null +++ b/arch/um/include/asm/unwind.h @@ -0,0 +1,8 @@ +#ifndef _ASM_UML_UNWIND_H +#define _ASM_UML_UNWIND_H + +static inline void +unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} + +#endif /* _ASM_UML_UNWIND_H */ diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..f65a804b86f0 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ - obj-y += realmode/ obj-y += kernel/ obj-y += mm/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 323cb065be5e..9c95aa417e9b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -73,7 +73,6 @@ config X86 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH - select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT @@ -158,6 +157,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES @@ -168,7 +168,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -778,8 +778,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index cd20ca0b4043..71a48a30fc84 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -305,8 +305,6 @@ config DEBUG_ENTRY Some of these sanity checks may slow down kernel entries and exits or otherwise impact performance. - This is currently used to help test NMI code. - If unsure, say N. config DEBUG_NMI_SELFTEST @@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +choice + prompt "Choose kernel unwinder" + default FRAME_POINTER_UNWINDER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. + +config FRAME_POINTER_UNWINDER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). 
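The help text above describes the frame-pointer trade-off. As a minimal sketch (not code from this patch), a frame-pointer unwinder essentially chases the saved %rbp chain: each frame begins with the caller's saved %rbp followed by the return address, so unwinding is plain pointer chasing. The struct layout and function names below are illustrative only:

#include <linux/printk.h>

struct stack_frame {
	struct stack_frame *next_frame;		/* caller's saved %rbp */
	unsigned long return_address;		/* saved return %rip */
};

static void walk_frame_pointers(unsigned long bp)
{
	struct stack_frame *frame = (struct stack_frame *)bp;
	int i;

	/* Bound the walk in case the %rbp chain is corrupt or unterminated. */
	for (i = 0; frame && i < 64; i++) {
		printk("  %pS\n", (void *)frame->return_address);
		frame = frame->next_frame;
	}
}

This per-function %rbp bookkeeping is what costs the ~3% text growth and 5-10% slowdown cited above; the ORC unwinder described next moves the equivalent information into out-of-band tables generated by objtool.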
+ +config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It also enables a 5-10% performance + improvement across the entire kernel compared to frame pointers. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config GUESS_UNWINDER + bool "Guess unwinder" + depends on EXPERT + ---help--- + This option enables the "guess" unwinder for unwinding kernel stack + traces. It scans the stack and reports every kernel text address it + finds. Some of the addresses it reports may be incorrect. + + While this option often produces false positives, it can still be + useful in many cases. Unlike the other unwinders, it has no runtime + overhead. + +endchoice + +config FRAME_POINTER + depends on !ORC_UNWINDER && !GUESS_UNWINDER + bool + endmenu diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4b429df40d7a..550cd5012b73 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,3 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +CONFIG_GUESS_UNWINDER=y +# CONFIG_FRAME_POINTER_UNWINDER is not set diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9976fcecd17e..af28a8a24366 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,7 +2,6 @@ # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 05ed3d393da7..640aafebdc00 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,4 +1,5 @@ #include <linux/jump_label.h> +#include <asm/unwind_hints.h> /* @@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with movq %rdx, 12*8+\offset(%rsp) movq %rsi, 13*8+\offset(%rsp) movq %rdi, 14*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with movq %r12, 3*8+\offset(%rsp) movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset .endm .macro RESTORE_EXTRA_REGS offset=0 @@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with movq 3*8+\offset(%rsp), %r12 movq 4*8+\offset(%rsp), %rbp movq 5*8+\offset(%rsp), %rbx + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 @@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with .endif movq 13*8(%rsp), %rsi movq 14*8(%rsp), %rdi + UNWIND_HINT_IRET_REGS offset=16*8 .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6d078b89a5e8..4dbb336a1fdd 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include 
<linux/err.h> .code64 @@ -43,9 +44,10 @@ #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY swapgs sysretq -ENDPROC(native_usergs_sysret64) +END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_IRETQ @@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. - */ -GLOBAL(entry_SYSCALL_64_after_swapgs) + swapgs movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 /* * If we need to do entry work or if we guess we'll need to do @@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath: movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 1: @@ -316,6 +316,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 opportunistic_sysret_failed: @@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax + UNWIND_HINT_REGS extra=0 jmp entry_SYSCALL64_slow_path 1: @@ -351,6 +353,7 @@ END(stub_ptregs_64) .macro ptregs_stub func ENTRY(ptregs_\func) + UNWIND_HINT_FUNC leaq \func(%rip), %rax jmp stub_ptregs_64 END(ptregs_\func) @@ -367,6 +370,7 @@ END(ptregs_\func) * %rsi: next task */ ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -406,6 +410,7 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -413,6 +418,7 @@ ENTRY(ret_from_fork) jnz 1f /* kernel threads are uncommon */ 2: + UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ @@ -440,13 +446,102 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 jmp common_interrupt .align 8 + vector=vector+1 .endr END(irq_entries_start) +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushfq + testl $X86_EFLAGS_IF, (%rsp) + jz .Lokay_\@ + ud2 +.Lokay_\@: + addq $8, %rsp +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. 
+ */ +.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. + * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. + * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. + */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + /* * Interrupt entry/exit. * @@ -485,17 +580,7 @@ END(irq_entries_start) CALL_enter_from_user_mode 1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rdi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rdi + ENTER_IRQ_STACK old_rsp=%rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -515,10 +600,8 @@ common_interrupt: ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - /* Restore saved previous stack */ - popq %rsp + LEAVE_IRQ_STACK testb $3, CS(%rsp) jz retint_kernel @@ -561,6 +644,7 @@ restore_c_regs_and_iret: INTERRUPT_RETURN ENTRY(native_iret) + UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. 
@@ -633,6 +717,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 /* * At this point, we cannot write to the stack any more, but we can @@ -654,6 +739,7 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) + UNWIND_HINT_IRET_REGS ASM_CLAC pushq $~(\num) .Lcommon_\sym: @@ -740,6 +826,8 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=8 + /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 .error "using shift_ist requires paranoid=1" @@ -763,6 +851,7 @@ ENTRY(\sym) .else call error_entry .endif + UNWIND_HINT_REGS /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid @@ -860,6 +949,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * edi: new selector */ ENTRY(native_load_gs_index) + FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS @@ -868,8 +958,9 @@ ENTRY(native_load_gs_index) 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq + FRAME_END ret -END(native_load_gs_index) +ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, bad_gs) @@ -892,14 +983,12 @@ bad_gs: ENTRY(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr), %rsp - push %rbp /* frame pointer backlink */ + ENTER_IRQ_STACK regs=0 old_rsp=%r11 call __do_softirq + LEAVE_IRQ_STACK regs=0 leaveq - decl PER_CPU_VAR(irq_count) ret -END(do_softirq_own_stack) +ENDPROC(do_softirq_own_stack) #ifdef CONFIG_XEN idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 @@ -923,14 +1012,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ + UNWIND_HINT_FUNC movq %rdi, %rsp /* we don't return, adjust the stack frame */ -11: incl PER_CPU_VAR(irq_count) - movq %rsp, %rbp - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rbp /* frame pointer backlink */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + LEAVE_IRQ_STACK + #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif @@ -951,6 +1040,7 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) jne 1f @@ -970,11 +1060,13 @@ ENTRY(xen_failsafe_callback) pushq $0 /* RIP */ pushq %r11 pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp), %rcx movq 8(%rsp), %r11 addq $0x30, %rsp + UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS @@ -1020,6 +1112,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1047,6 +1140,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? 
*/ @@ -1068,6 +1162,7 @@ END(paranoid_exit) * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1152,6 +1247,7 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testl %ebx, %ebx @@ -1161,6 +1257,7 @@ END(error_exit) /* Runs on exception stack */ ENTRY(nmi) + UNWIND_HINT_IRET_REGS /* * Fix up the exception frame if we're on Xen. * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most @@ -1234,11 +1331,13 @@ ENTRY(nmi) cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ pushq 2*8(%rdx) /* pt_regs->cs */ pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -1255,6 +1354,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS ENCODE_FRAME_POINTER /* @@ -1409,6 +1509,7 @@ first_nmi: .rept 5 pushq 11*8(%rsp) .endr + UNWIND_HINT_IRET_REGS /* Everything up to here is safe from nested NMIs */ @@ -1424,6 +1525,7 @@ first_nmi: pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ INTERRUPT_RETURN /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS 1: #endif @@ -1473,6 +1575,7 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry + UNWIND_HINT_REGS /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi @@ -1510,17 +1613,19 @@ nmi_restore: END(nmi) ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax - leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -1: jmp 1b END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..5314d7b8e5ad 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat) */ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + swapgs /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 724153797209..e0bb46c02857 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, if (ksig->ka.sa.sa_flags & SA_ONSTACK) sp = sigsp(sp, ksig); /* This is the legacy signal stack switching. 
*/ - else if ((regs->ss & 0xffff) != __USER32_DS && + else if (regs->ss != __USER32_DS && !(ksig->ka.sa.sa_flags & SA_RESTORER) && ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9aeb91935ce0..bda9f94bcb10 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -126,15 +126,15 @@ do { \ pr_reg[4] = regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ - pr_reg[7] = regs->ds & 0xffff; \ - pr_reg[8] = regs->es & 0xffff; \ - pr_reg[9] = regs->fs & 0xffff; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + pr_reg[9] = regs->fs; \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ - pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ - pr_reg[16] = regs->ss & 0xffff; \ + pr_reg[16] = regs->ss; \ } while (0); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 48febf07e828..1310e1f1cd65 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) +#define readb readb +#define readw readw +#define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) @@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb writeb +#define writew writew +#define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) @@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) readq(a) -#define writeq_relaxed(v, a) writeq(v, a) +#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) -#define __raw_readq(a) readq(a) -#define __raw_writeq(val, addr) writeq(val, addr) +#define __raw_readq __readq +#define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq @@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } +#define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual @@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } +#define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. 
@@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * else, you probably want one of the following. */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +#define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +#define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +#define ioremap_prot ioremap_prot /** * ioremap - map bus memory into CPU space @@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); } +#define ioremap ioremap extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap extern void set_iounmap_nonlazy(void); @@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> /* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -/** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set - * - * Set a range of I/O memory to a given value. - */ -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -/** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data from I/O memory. - */ -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -/** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data to I/O memory. - */ -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. 
The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values @@ -341,13 +309,38 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) +#define inb inb +#define inw inw +#define inl inl +#define inb_p inb_p +#define inw_p inw_p +#define inl_p inl_p +#define insb insb +#define insw insw +#define insl insl + +#define outb outb +#define outw outw +#define outl outl +#define outb_p outb_p +#define outw_p outw_p +#define outl_p outl_p +#define outsb outsb +#define outsw outsw +#define outsl outsl + extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +#define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); +#define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#include <asm-generic/io.h> +#undef PCI_IOBASE + #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include <asm/desc.h> - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. 
*/ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include <asm/hw_irq.h> - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..9eb7c718aaf8 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include <asm-generic/module.h> +#include <asm/orc_types.h> + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. + */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h new file mode 100644 index 000000000000..9c9dc579bd7d --- /dev/null +++ b/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include <linux/types.h> +#include <linux/compiler.h> + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. + */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. 
+ */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 028245e1c42b..abc99b9c7ffd 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -22,6 +22,7 @@ struct vm86; #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> +#include <asm/unwind_hints.h> #include <linux/personality.h> #include <linux/cache.h> @@ -661,7 +662,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a @@ -684,6 +685,7 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( + UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -693,6 +695,7 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" + UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2b5d686ea9f3..91c04c8e67fa 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -9,6 +9,20 @@ #ifdef __i386__ struct pt_regs { + /* + * NB: 32-bit x86 CPUs are inconsistent as what happens in the + * following cases (where %seg represents a segment register): + * + * - pushl %seg: some do a 16-bit write and leave the high + * bits alone + * - movl %seg, [mem]: some do a 16-bit write despite the movl + * - IDT entry: some (e.g. 486) will leave the high bits of CS + * and (if applicable) SS undefined. + * + * Fortunately, x86-32 doesn't read the high bits on POP or IRET, + * so we can just treat all of the segment registers as 16-bit + * values. + */ unsigned long bx; unsigned long cx; unsigned long dx; @@ -16,16 +30,22 @@ struct pt_regs { unsigned long di; unsigned long bp; unsigned long ax; - unsigned long ds; - unsigned long es; - unsigned long fs; - unsigned long gs; + unsigned short ds; + unsigned short __dsh; + unsigned short es; + unsigned short __esh; + unsigned short fs; + unsigned short __fsh; + unsigned short gs; + unsigned short __gsh; unsigned long orig_ax; unsigned long ip; - unsigned long cs; + unsigned short cs; + unsigned short __csh; unsigned long flags; unsigned long sp; - unsigned long ss; + unsigned short ss; + unsigned short __ssh; }; #else /* __i386__ */ @@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); + + /* The selector fields are 16-bit. 
*/ + if (offset == offsetof(struct pt_regs, cs) || + offset == offsetof(struct pt_regs, ss) || + offset == offsetof(struct pt_regs, ds) || + offset == offsetof(struct pt_regs, es) || + offset == offsetof(struct pt_regs, fs) || + offset == offsetof(struct pt_regs, gs)) { + return *(u16 *)((unsigned long)regs + offset); + + } #endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 661dd305694a..045f99211a99 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,45 +1,56 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +#define __CLOBBERS_MEM "memory" +#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" + #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : "m" (var), ## __VA_ARGS__ \ - : "memory" : cc_label); \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : clobbers : cc_label); \ return 0; \ cc_label: \ return 1; \ } while (0) -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) +#define __BINARY_RMWcc_ARG " %1, " -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ asm volatile (fullop ";" CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ return c; \ } while (0) +#define __BINARY_RMWcc_ARG " %2, " + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ + __CLOBBERS_MEM, vcon (val)) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX, vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..e9f793e2df7a 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER_UNWINDER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +27,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long 
unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? &state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h new file mode 100644 index 000000000000..bae46fc6b9de --- /dev/null +++ b/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. 
+ * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp + .if \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .else + .set sp_reg, ORC_REG_SP + .endif + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..66b8f93333d1 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -201,7 +201,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. 
Both domU and dom0 type of guests are diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a01892bdd61a..287eac7d207f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -126,11 +126,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 32e14d137416..3344d3382e91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr; int poke_int3_handler(struct pt_regs *regs) { - /* bp_patching_in_progress */ + /* + * Having observed our INT3 instruction, we now must observe + * bp_patching_in_progress. + * + * in_progress = TRUE INT3 + * WMB RMB + * write INT3 if (in_progress) + * + * Idem for bp_int3_handler. + */ smp_rmb(); if (likely(!bp_patching_in_progress)) @@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) bp_int3_addr = (u8 *)addr + sizeof(int3); bp_patching_in_progress = true; /* - * Corresponding read barrier in int3 notifier for - * making sure the in_progress flags is correctly ordered wrt. - * patching + * Corresponding read barrier in int3 notifier for making sure the + * in_progress and handler are correctly ordered wrt. patching. */ smp_wmb(); @@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) text_poke(addr, opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - + /* + * sync_core() implies an smp_mb() and orders this store against + * the writing of the new instruction. 
+ */ + bp_patching_in_progress = false; - smp_wmb(); return addr; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 237e9c2341c7..70e48aa6af98 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index << 15) | ir_entry->index, + buf, (ir_entry->index2 << 15) | ir_entry->index, ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 880aa093268d..710edab9e644 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -4,9 +4,6 @@ #include <asm/ucontext.h> -#include <linux/lguest.h> -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include <asm/syscalls_32.h> @@ -62,23 +59,6 @@ void foo(void) OFFSET(stack_canary_offset, stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dbce3cca94cb..f13b4c00a5de 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) + __show_regs(regs, 0); + /* * Scan the stack, printing any text addresses we find. At the * same time, follow proper stack frames with the unwinder. @@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * Don't print regs->ip again if it was already printed * by __show_regs() below. */ - if (regs && stack == &regs->ip) { - unwind_next_frame(&state); - continue; - } + if (regs && stack == &regs->ip) + goto next; if (stack == ret_addr_p) reliable = 1; @@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!reliable) continue; +next: /* * Get the next frame from the unwinder. 
No need to * check for an error: if anything goes wrong, the rest @@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs) + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) __show_regs(regs, 0); } @@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_X86_32 if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; } else { sp = kernel_stack_pointer(regs); savesegment(ss, ss); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e5f0b40e66d2..4f0481474903 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 3e1471d57487..225af4184f06 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack < begin || stack >= end) + if (stack <= begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 1f85ee8f9439..29da9599fec0 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -155,7 +155,6 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ @@ -165,7 +164,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. 
- subarch_entries) / 4 @@ -457,12 +455,9 @@ early_idt_handler_common: /* The vector number is in pt_regs->gs */ cld - pushl %fs /* pt_regs->fs */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %es /* pt_regs->es */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %ds /* pt_regs->ds */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ pushl %eax /* pt_regs->ax */ pushl %ebp /* pt_regs->bp */ pushl %edi /* pt_regs->di */ @@ -479,9 +474,8 @@ early_idt_handler_common: /* Load the vector number into EDX */ movl PT_GS(%esp), %edx - /* Load GS into pt_regs->gs and clear high bits */ + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ movw %gs, PT_GS(%esp) - movw $0, PT_GS+2(%esp) movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ call early_fixup_exception @@ -493,10 +487,10 @@ early_idt_handler_common: popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ - popl %ds /* pt_regs->ds */ - popl %es /* pt_regs->es */ - popl %fs /* pt_regs->fs */ - popl %gs /* pt_regs->gs */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl %fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a870910c8565..f0e64db18ac8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -21,6 +21,25 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + /* context.lock is held for us, so we don't need any locking. */ static void flush_ldt(void *__mm) { @@ -32,6 +51,8 @@ static void flush_ldt(void *__mm) pc = &mm->context; set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + + refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f67bd3205df7..62e7d70aadd5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> +#include <asm/unwind.h> #if 0 #define DEBUGP(fmt, ...) 
\ @@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..502a77d0adb0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c6d6dc5f8bb2..efc5eeb58292 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all) if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; gs = get_user_gs(regs); } else { sp = kernel_stack_pointer(regs); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c3169be4c596..c85269a76511 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, - (void *)regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, regs->sp, regs->flags); if (regs->orig_ax != -1) @@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. 
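As a rough illustration of the invariant these comments rely on (the cached base is only trusted while the saved selector is zero), here is a deliberately simplified userspace model; the struct and function names are invented, and the real code of course operates on segment registers and MSRs rather than plain integers:

#include <stdio.h>

/* Illustrative sketch only, not kernel code. */
struct seg_state {
	unsigned short index;  /* selector last loaded into the register */
	unsigned long base;    /* cached base, trusted only while index == 0 */
};

/* Rough analogue of save_base_legacy(): once a nonzero selector has been
 * loaded, the cached base can no longer be trusted, so record it as zero. */
static void save_legacy(struct seg_state *s, unsigned short selector)
{
	s->index = selector;
	if (selector != 0)
		s->base = 0;
}

/* Rough analogue of the cheap path in load_seg_legacy(): when the next base
 * is zero, the segment reload may be skipped only if both states are fully
 * zeroed. */
static int need_reload(const struct seg_state *prev, const struct seg_state *next)
{
	return (prev->index | next->index | prev->base) != 0;
}

int main(void)
{
	struct seg_state prev = { 0, 0 }, next = { 0, 0 };

	save_legacy(&prev, 0);
	printf("reload: %d\n", need_reload(&prev, &next)); /* 0: both fully zero */

	save_legacy(&prev, 7);
	printf("reload: %d\n", need_reload(&prev, &next)); /* 1: base now untrusted */
	return 0;
}
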
+ */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); switch_fpu_prepare(prev_fpu, cpu); @@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. 
xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. 
- */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, cpu); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d0498800..ecab32282f0f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -115,6 +115,7 @@ #include <asm/microcode.h> #include <asm/mmu_context.h> #include <asm/kaslr.h> +#include <asm/unwind.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -1310,6 +1311,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc30a74e4adb..e04442345fc0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && - (regs->ss & 0xffff) != __USER_DS && + regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 5f25cfbd952e..5ee663836c08 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re unsigned long addr, seg; addr = regs->ip; - seg = regs->cs & 0xffff; + seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..7574ef5f16ec 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -10,20 +10,22 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? &state->regs->ip : state->bp + 1; +} static void unwind_dump(struct unwind_state *state) { @@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) } } -unsigned long unwind_get_return_address(struct unwind_state *state) -{ - if (unwind_done(state)) - return 0; - - return __kernel_text_address(state->ip) ? 
state->ip : 0; -} -EXPORT_SYMBOL_GPL(unwind_get_return_address); - static size_t regs_size(struct pt_regs *regs) { /* x86_32 regs from kernel mode are two words shorter: */ diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 039f36738e49..4f0e17b90463 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000000..570b70d3f604 --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,582 @@ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> +#include <asm/orc_types.h> +#include <asm/orc_lookup.h> +#include <asm/sections.h> + +#define orc_warn(fmt, ...) \ + printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static DEFINE_MUTEX(sort_mutex); +int *cur_orc_ip_table = __start_orc_unwind_ip; +struct orc_entry *cur_orc_table = __start_orc_unwind; + +unsigned int lookup_num_blocks; +bool orc_init; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. 
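The lookup this comment describes is a "rightmost entry <= ip" binary search. A small standalone sketch of the same loop over a plain sorted array (the values are hypothetical) might look like:

#include <stdio.h>

/* Illustrative sketch only: the rightmost-match search that __orc_find()
 * performs, on a plain sorted array instead of the .orc_unwind_ip table. */
static int find_rightmost_le(const unsigned long *tbl, int n, unsigned long key)
{
	int first = 0, last = n - 1, found = 0;

	while (first <= last) {
		int mid = first + (last - first) / 2;

		if (tbl[mid] <= key) {
			found = mid;       /* best match so far */
			first = mid + 1;   /* but keep looking to the right */
		} else {
			last = mid - 1;
		}
	}
	return found;
}

int main(void)
{
	/* Duplicates model "weak" terminator entries sharing one address. */
	unsigned long ips[] = { 0x100, 0x180, 0x180, 0x200, 0x400 };

	printf("%d\n", find_rightmost_le(ips, 5, 0x1f0));  /* prints 2 */
	return 0;
}
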
+ */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + if (!orc_init) + return NULL; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", + idx, lookup_num_blocks, ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", + idx, lookup_num_blocks, start, stop, ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + return orc_module_find(ip); +} + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + struct orc_entry orc_tmp; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + orc_tmp = *orc_a; + *orc_a = *orc_b; + *orc_b = orc_tmp; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; +} + +#ifdef CONFIG_MODULES +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. 
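In other words, sort() only knows how to swap entries of one array, so the cur_orc_* globals let the swap callback mirror each swap into the companion table. A tiny standalone sketch of that idea, keeping two parallel tables in lock-step while sorting, here with a plain insertion sort rather than the kernel's sort() (names and values are illustrative):

#include <stdio.h>

#define N 4

int main(void)
{
	int ip[N]   = { 40, 10, 30, 20 };      /* stands in for .orc_unwind_ip */
	char orc[N] = { 'd', 'a', 'c', 'b' };  /* stands in for .orc_unwind */

	/* Every swap of the ip table is applied to the orc table as well. */
	for (int i = 1; i < N; i++) {
		for (int j = i; j > 0 && ip[j - 1] > ip[j]; j--) {
			int t = ip[j]; ip[j] = ip[j - 1]; ip[j - 1] = t;
			char c = orc[j]; orc[j] = orc[j - 1]; orc[j - 1] = c;
		}
	}

	for (int i = 0; i < N; i++)
		printf("%d -> %c\n", ip[i], orc[i]);  /* 10->a 20->b 30->c 40->d */
	return 0;
}
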
+ */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* Sort the .orc_unwind and .orc_unwind_ip tables: */ + sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, + orc_sort_swap); + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If the address isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the address + * could be on a subsequent stack. + */ + while (!on_stack(info, (void *)addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + return true; +} + +#define REGS_SIZE (sizeof(struct pt_regs)) +#define SP_OFFSET (offsetof(struct pt_regs, sp)) +#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp, bool full) +{ + size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; + size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; + struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); + + if (IS_ENABLED(CONFIG_X86_64)) { + if (!stack_access_ok(state, addr, regs_size)) + return false; + + *ip = regs->ip; + *sp = regs->sp; + + return true; + } + + if (!stack_access_ok(state, addr, sp_offset)) + return false; + + *ip = regs->ip; + + if (user_mode(regs)) { + if (!stack_access_ok(state, addr + sp_offset, + REGS_SIZE - SP_OFFSET)) + return false; + + *sp = regs->sp; + } else + *sp = (unsigned long)&regs->sp; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto done; + + /* + * Find the orc_entry associated with the text address. + * + * Decrement call return addresses by one so they work for sibling + * calls and calls to noreturn functions. + */ + orc = orc_find(state->signal ? state->ip : state->ip - 1); + if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) + goto done; + orig_ip = state->ip; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R10 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r10; + break; + + case ORC_REG_R13: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R13 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r13; + break; + + case ORC_REG_DI: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DI at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->di; + break; + + case ORC_REG_DX: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DX at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->dx; + break; + + default: + orc_warn("unknown SP base reg %d for ip %p\n", + orc->sp_reg, (void *)state->ip); + goto done; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto done; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto done; + + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->ip, (void *)ip_p); + + state->sp = sp; + state->regs = NULL; + state->signal = false; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + orc_warn("can't dereference registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + state->regs = (struct pt_regs *)sp; + state->full_regs = true; + state->signal = true; + break; + + case ORC_TYPE_REGS_IRET: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + orc_warn("can't dereference iret registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + 
ptregs = container_of((void *)sp, struct pt_regs, ip); + if ((unsigned long)ptregs >= prev_sp && + on_stack(&state->stack_info, ptregs, REGS_SIZE)) { + state->regs = ptregs; + state->full_regs = false; + } else + state->regs = NULL; + + state->signal = true; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d\n", orc->type); + break; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (state->regs && state->full_regs) + state->bp = state->regs->bp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto done; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto done; + break; + + default: + orc_warn("unknown BP base reg %d for ip %p\n", + orc->bp_reg, (void *)orig_ip); + goto done; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn("stack going in the wrong direction? ip=%p\n", + (void *)orig_ip); + goto done; + } + + preempt_enable(); + return true; + +done: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto done; + + if (regs) { + if (user_mode(regs)) + goto done; + + state->ip = regs->ip; + state->sp = kernel_stack_pointer(regs); + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp; + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) + return; + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. 
+ */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +done: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8a3b61be0aa..f05f00acac89 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page_types.h> +#include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> @@ -148,6 +149,8 @@ SECTIONS BUG_TABLE + ORC_UNWIND_TABLE + . = ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2688c7dc5323..3ea624452f93 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -89,6 +89,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. - - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. 
- * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include <linux/kernel.h> -#include <linux/start_kernel.h> -#include <linux/string.h> -#include <linux/console.h> -#include <linux/screen_info.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/clocksource.h> -#include <linux/clockchips.h> -#include <linux/lguest.h> -#include <linux/lguest_launcher.h> -#include <linux/virtio_console.h> -#include <linux/pm.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <linux/virtio_pci.h> -#include <asm/acpi.h> -#include <asm/apic.h> -#include <asm/lguest.h> -#include <asm/paravirt.h> -#include <asm/param.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/setup.h> -#include <asm/e820/api.h> -#include <asm/mce.h> -#include <asm/io.h> -#include <asm/fpu/api.h> -#include <asm/stackprotector.h> -#include <asm/reboot.h> /* for struct machine_ops */ -#include <asm/kvm_para.h> -#include <asm/pci_x86.h> -#include <asm/pci-direct.h> - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). -:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. 
*/ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. */ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. 
Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. - * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. 
- * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is the exactly like the IDT code. - */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries? - */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. Some comments scattered though the - * kernel code indicate that this used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. - * As you might imagine, after a decade and a half this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! 
- * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). - * We futureproof our code a little by sticking to known CPUID values. - */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed. But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. 
- */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. */ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. - * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. 
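To make the 10/10/12 split above concrete, here is a short standalone sketch that decodes a non-PAE 32-bit virtual address into its page directory index, page table index and page offset; the example address is arbitrary:

#include <stdio.h>
#include <stdint.h>

/* Illustrative sketch only: the non-PAE two-level address decomposition. */
int main(void)
{
	uint32_t vaddr = 0xC0100F04;                 /* arbitrary example address */

	uint32_t pgd_index = (vaddr >> 22) & 0x3FF;  /* top 10 bits: page directory */
	uint32_t pte_index = (vaddr >> 12) & 0x3FF;  /* next 10 bits: page table */
	uint32_t offset    =  vaddr        & 0xFFF;  /* low 12 bits: within page */

	/* prints: pgd=768 pte=256 offset=0xf04 */
	printf("pgd=%u pte=%u offset=0x%03x\n", pgd_index, pte_index, offset);
	return 0;
}
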
- * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! - * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). - */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. 
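(The 10+10+12 and 2+9+9+12 address splits described in the comments above can be checked with a few shifts and masks. A stand-alone sketch, illustrative only; the field names follow the comments, not any kernel interface.)

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t vaddr = 0xC0100A34;            /* an arbitrary kernel-space address */

        /* Classic two-level split: 10 + 10 + 12 bits. */
        unsigned dir = (vaddr >> 22) & 0x3ff;   /* index into the page directory */
        unsigned pte = (vaddr >> 12) & 0x3ff;   /* index into the pagetable page */
        unsigned off = vaddr & 0xfff;           /* offset within the 4k page     */

        printf("2-level: dir=%u pte=%u offset=0x%03x\n", dir, pte, off);

        /* PAE split: 2 + 9 + 9 + 12 bits (512 64-bit entries per page). */
        printf("PAE:     top=%u pmd=%u pte=%u offset=0x%03x\n",
               (unsigned)((vaddr >> 30) & 0x3),         /* one of the 4 top entries */
               (unsigned)((vaddr >> 21) & 0x1ff),       /* index into the PMD page  */
               (unsigned)((vaddr >> 12) & 0x1ff),       /* index into the PTE page  */
               (unsigned)(vaddr & 0xfff));
        return 0;
}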
*/ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. - */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). - * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking though routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. 
- * - * We can tell the Host what interrupts we want blocked ready for using the - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called. */ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. - */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. 
- */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. */ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. 
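(The retry loop in lguest_clock_read() above is a lock-free way to read a two-word value that the Host may update underneath us: read seconds, read nanoseconds, then re-check seconds and start over if it moved. A user-space sketch of the same pattern with C11 atomics; struct shared_time and read_time() are illustrative names, not kernel interfaces.)

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct shared_time {
        _Atomic long sec;       /* updated concurrently by another party */
        _Atomic long nsec;
};

/*
 * Keep retrying until the seconds value is stable around the nanoseconds
 * read, so we never pair e.g. 100s with a stale 999999999ns.
 */
static uint64_t read_time(struct shared_time *t)
{
        long sec, nsec;

        do {
                sec  = atomic_load_explicit(&t->sec, memory_order_acquire);
                nsec = atomic_load_explicit(&t->nsec, memory_order_acquire);
        } while (atomic_load_explicit(&t->sec, memory_order_acquire) != sec);

        return (uint64_t)sec * 1000000000ULL + (uint64_t)nsec;
}

int main(void)
{
        struct shared_time t = { 99, 123456789 };

        printf("%llu ns\n", (unsigned long long)read_time(&t));
        return 0;
}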
- */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. */ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. - * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. - */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! 
Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. - */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicing. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. */ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the * - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find. - */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. 
*/ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. - */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. */ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. - * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. 
We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. - */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. 
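(Boiled down, lguest_patch() above is "copy a prebuilt byte sequence over the call site if it fits, otherwise keep the indirect call". A stripped-down sketch of that decision; patch_site(), the template table and the stand-in bytes are illustrative rather than the kernel's paravirt API, and the real code falls back to paravirt_patch_default() instead of returning 0.)

#include <stddef.h>
#include <string.h>

struct template {
        const unsigned char *start, *end;       /* bounds of a prebuilt sequence */
};

/* Stand-in template bytes; the real ones come from labels in head_32.S. */
static const unsigned char tmpl_irq_disable[] = { 0xfa };

static const struct template templates[] = {
        [0] = { tmpl_irq_disable, tmpl_irq_disable + sizeof(tmpl_irq_disable) },
};

/*
 * Returns how many bytes were written into buf; 0 means "no template, or it
 * doesn't fit", i.e. the caller keeps the slower indirect call.
 */
static size_t patch_site(unsigned int op, void *buf, size_t room)
{
        size_t len;

        if (op >= sizeof(templates) / sizeof(templates[0]) || !templates[op].start)
                return 0;

        len = templates[op].end - templates[op].start;
        if (len > room)
                return 0;

        memcpy(buf, templates[op].start, len);  /* splice the template in place */
        return len;
}

int main(void)
{
        unsigned char site[8];

        return patch_site(0, site, sizeof(site)) ? 0 : 1;
}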
- */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector it, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well. 
- */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. - * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include <linux/linkage.h> -#include <linux/lguest.h> -#include <asm/lguest_hcall.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/processor-flags.h> - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! 
- * - * The .section line puts this code in .init.text so it will be discarded after - * boot. - */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. - */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. - */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. 
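(The lg_irq_enable routine above encodes a small protocol: flip your "interrupts enabled" flag in memory shared with the Host, then check whether the Host already queued an interrupt and only pay for a hypercall if it did. A rough C rendering of that logic; notify_host() stands in for the LHCALL_SEND_INTERRUPTS hypercall and struct shared is an illustrative stand-in for lguest_data.)

#include <stdio.h>

#define EFLAGS_IF 0x200                         /* X86_EFLAGS_IF */

struct shared {
        unsigned long irq_enabled;              /* what the Guest allows       */
        unsigned long irq_pending;              /* what the Host wants to send */
};

static void notify_host(void)                   /* stand-in for the hypercall */
{
        printf("hypercall: LHCALL_SEND_INTERRUPTS\n");
}

static void irq_enable(struct shared *s)
{
        s->irq_enabled = EFLAGS_IF;             /* interrupts are allowed now   */
        if (s->irq_pending)                     /* only trap if work is waiting */
                notify_host();
}

int main(void)
{
        struct shared s = { 0, EFLAGS_IF };

        irq_enable(&s);                         /* pending set, so this notifies */
        s.irq_pending = 0;
        irq_enable(&s);                         /* nothing pending: no hypercall */
        return 0;
}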
- */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. */ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. - * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. 
- */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..fb2ddcdf7c73 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -142,7 +142,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 811e4ddb3f37..98491521bb43 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -981,59 +981,6 @@ void __ref xen_setup_vcpu_info_placement(void) } } -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (xen_have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. */ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - static const struct pv_info xen_info __initconst = { .shared_kernel_pmd = 0, @@ -1043,10 +990,6 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, @@ -1244,7 +1187,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_init_ops = xen_init_ops; + pv_init_ops.patch = paravirt_patch_default; pv_cpu_ops = xen_cpu_ops; x86_platform.get_nmi_reason = xen_get_nmi_reason; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index eff224df813f..dcd31fa39b5d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in percpu data) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. 
*/ #include <asm/asm-offsets.h> @@ -16,7 +10,7 @@ #include <asm/processor-flags.h> #include <asm/frame.h> -#include "xen-asm.h" +#include <linux/linkage.h> /* * Enable events. This clears the event mask and tests the pending @@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_irq_enable_direct) FRAME_END ret ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) /* @@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct) */ ENTRY(xen_irq_disable_direct) movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) +ENDPROC(xen_irq_disable_direct) /* * (xen_)save_fl is used to get the current interrupt enable status. @@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah -ENDPATCH(xen_save_fl_direct) ret ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) /* @@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct) /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jnz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_restore_fl_direct) FRAME_END ret ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) /* diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h deleted file mode 100644 index 465276467a47..000000000000 --- a/arch/x86/xen/xen-asm.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _XEN_XEN_ASM_H -#define _XEN_XEN_ASM_H - -#include <linux/linkage.h> - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index feb6d40a0860..1200e262a116 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/thread_info.h> @@ -18,21 +12,10 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call xen_force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 /* * This is run where a normal iret would be run, with the same stack setup: diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c3df43141e70..3a3b6a211584 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. 
The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/errno.h> @@ -20,7 +14,7 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp), %rcx @@ -46,9 +40,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 */ ENTRY(xen_iret) pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_iret) -RELOC(xen_iret, 1b+1) + jmp hypercall_iret ENTRY(xen_sysret64) /* @@ -65,9 +57,7 @@ ENTRY(xen_sysret64) pushq %rcx pushq $VGCF_in_syscall -1: jmp hypercall_iret -ENDPATCH(xen_sysret64) -RELOC(xen_sysret64, 1b+1) + jmp hypercall_iret /* * Xen handles syscall callbacks much like ordinary exceptions, which @@ -82,34 +72,47 @@ RELOC(xen_sysret64, 1b+1) * rip * r11 * rsp->rcx - * - * In all the entrypoints, we undo all that to make it look like a - * CPU-generated syscall/sysenter and jump to the normal entrypoint. */ -.macro undo_xen_syscall - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp -.endm - /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) - undo_xen_syscall - jmp entry_SYSCALL_64_after_swapgs + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER_DS and __USER_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER_DS, 4*8(%rsp) + movq $__USER_CS, 1*8(%rsp) + + jmp entry_SYSCALL_64_after_hwframe ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) - undo_xen_syscall - jmp entry_SYSCALL_compat + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSCALL_compat_after_hwframe ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) - undo_xen_syscall + mov 0*8(%rsp), %rcx + mov 1*8(%rsp), %r11 + mov 5*8(%rsp), %rsp jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d5004477db6..70301ac0d414 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -129,17 +129,10 @@ static inline void __init xen_efi_init(void) } #endif -/* Declare an asm function, along with symbols needed to make it - inlineable */ -#define DECL_ASM(ret, name, ...) 
\ - __visible ret name(__VA_ARGS__); \ - extern char name##_end[] __visible; \ - extern char name##_reloc[] __visible - -DECL_ASM(void, xen_irq_enable_direct, void); -DECL_ASM(void, xen_irq_disable_direct, void); -DECL_ASM(unsigned long, xen_save_fl_direct, void); -DECL_ASM(void, xen_restore_fl_direct, unsigned long); +__visible void xen_irq_enable_direct(void); +__visible void xen_irq_disable_direct(void); +__visible unsigned long xen_save_fl_direct(void); +__visible void xen_restore_fl_direct(unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); |
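(For reference, the deleted DECL_ASM() macro turned one line into three declarations; the extra _end and _reloc symbols existed only so the removed xen_patch() machinery could find the bounds and relocation point of each asm template. An illustrative expansion of the first user:)

/* DECL_ASM(void, xen_irq_enable_direct, void) used to expand to: */
__visible void xen_irq_enable_direct(void);
extern char xen_irq_enable_direct_end[] __visible;      /* end of the asm template  */
extern char xen_irq_enable_direct_reloc[] __visible;    /* spot needing relocation  */

With RELOC() and ENDPATCH() gone from xen-asm.h, only the plain function declaration is still needed, which is what the replacement lines above spell out by hand.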