diff options
Diffstat (limited to 'arch/x86/entry/vdso')
-rw-r--r-- | arch/x86/entry/vdso/vclock_gettime.c | 151 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso-layout.lds.S | 3 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso2c.c | 3 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso32/system_call.S | 54 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vma.c | 14 |
5 files changed, 135 insertions, 90 deletions
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index ca94fa649251..1a50e09c945b 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -17,8 +17,10 @@ #include <asm/vvar.h> #include <asm/unistd.h> #include <asm/msr.h> +#include <asm/pvclock.h> #include <linux/math64.h> #include <linux/time.h> +#include <linux/kernel.h> #define gtod (&VVAR(vsyscall_gtod_data)) @@ -36,12 +38,12 @@ static notrace cycle_t vread_hpet(void) } #endif -#ifndef BUILD_VDSO32 +#ifdef CONFIG_PARAVIRT_CLOCK +extern u8 pvclock_page + __attribute__((visibility("hidden"))); +#endif -#include <linux/kernel.h> -#include <asm/vsyscall.h> -#include <asm/fixmap.h> -#include <asm/pvclock.h> +#ifndef BUILD_VDSO32 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { @@ -60,75 +62,6 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) return ret; } -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) -{ - const struct pvclock_vsyscall_time_info *pvti_base; - int idx = cpu / (PAGE_SIZE/PVTI_SIZE); - int offset = cpu % (PAGE_SIZE/PVTI_SIZE); - - BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); - - pvti_base = (struct pvclock_vsyscall_time_info *) - __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); - - return &pvti_base[offset]; -} - -static notrace cycle_t vread_pvclock(int *mode) -{ - const struct pvclock_vsyscall_time_info *pvti; - cycle_t ret; - u64 last; - u32 version; - u8 flags; - unsigned cpu, cpu1; - - - /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. - * - */ - do { - cpu = __getcpu() & VGETCPU_CPU_MASK; - /* TODO: We can put vcpu id into higher bits of pvti.version. - * This will save a couple of cycles by getting rid of - * __getcpu() calls (Gleb). - */ - - pvti = get_pvti(cpu); - - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); - - /* - * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. - */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); - - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) - *mode = VCLOCK_NONE; - - /* refer to tsc.c read_tsc() comment for rationale */ - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - return last; -} -#endif #else @@ -162,15 +95,77 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) return ret; } +#endif + #ifdef CONFIG_PARAVIRT_CLOCK +static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) +{ + return (const struct pvclock_vsyscall_time_info *)&pvclock_page; +} static notrace cycle_t vread_pvclock(int *mode) { - *mode = VCLOCK_NONE; - return 0; -} -#endif + const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; + cycle_t ret; + u64 tsc, pvti_tsc; + u64 last, delta, pvti_system_time; + u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; + /* + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. + * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. + * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + * + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. + */ + + do { + version = pvti->version; + + smp_rmb(); + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { + *mode = VCLOCK_NONE; + return 0; + } + + tsc = rdtsc_ordered(); + pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; + pvti_tsc_shift = pvti->tsc_shift; + pvti_system_time = pvti->system_time; + pvti_tsc = pvti->tsc_timestamp; + + /* Make sure that the version double-check is last. */ + smp_rmb(); + } while (unlikely((version & 1) || version != pvti->version)); + + delta = tsc - pvti_tsc; + ret = pvti_system_time + + pvclock_scale_delta(delta, pvti_tsc_to_system_mul, + pvti_tsc_shift); + + /* refer to vread_tsc() comment for rationale */ + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} #endif notrace static cycle_t vread_tsc(void) diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index de2c921025f5..4158acc17df0 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 2 * PAGE_SIZE; + vvar_start = . - 3 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -36,6 +36,7 @@ SECTIONS #undef EMIT_VVAR hpet_page = vvar_start + PAGE_SIZE; + pvclock_page = vvar_start + 2 * PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 785d9922b106..491020b2826d 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -73,6 +73,7 @@ enum { sym_vvar_start, sym_vvar_page, sym_hpet_page, + sym_pvclock_page, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -80,6 +81,7 @@ enum { const int special_pages[] = { sym_vvar_page, sym_hpet_page, + sym_pvclock_page, }; struct vdso_sym { @@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, + [sym_pvclock_page] = {"pvclock_page", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 93bd8452383f..3a1d9297074b 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -1,5 +1,5 @@ /* - * Code for the vDSO. This version uses the old int $0x80 method. + * AT_SYSINFO entry point */ #include <asm/dwarf2.h> @@ -21,35 +21,67 @@ __kernel_vsyscall: /* * Reshuffle regs so that all of any of the entry instructions * will preserve enough state. + * + * A really nice entry sequence would be: + * pushl %edx + * pushl %ecx + * movl %esp, %ecx + * + * Unfortunately, naughty Android versions between July and December + * 2015 actually hardcode the traditional Linux SYSENTER entry + * sequence. That is severely broken for a number of reasons (ask + * anyone with an AMD CPU, for example). Nonetheless, we try to keep + * it working approximately as well as it ever worked. + * + * This link may eludicate some of the history: + * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 + * personally, I find it hard to understand what's going on there. + * + * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE. + * Execute an indirect call to the address in the AT_SYSINFO auxv + * entry. That is the ONLY correct way to make a fast 32-bit system + * call on Linux. (Open-coding int $0x80 is also fine, but it's + * slow.) */ + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 pushl %edx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET edx, 0 - pushl %ecx + pushl %ebp CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - movl %esp, %ecx + CFI_REL_OFFSET ebp, 0 + + #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" + #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" #ifdef CONFIG_X86_64 /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ - ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \ - "syscall", X86_FEATURE_SYSCALL32 + ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \ + SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 #else - ALTERNATIVE "", "sysenter", X86_FEATURE_SEP + ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP #endif /* Enter using int $0x80 */ - movl (%esp), %ecx int $0x80 GLOBAL(int80_landing_pad) - /* Restore ECX and EDX in case they were clobbered. */ - popl %ecx - CFI_RESTORE ecx + /* + * Restore EDX and ECX in case they were clobbered. EBP is not + * clobbered (the kernel restores it), but it's cleaner and + * probably faster to pop it than to adjust ESP using addl. + */ + popl %ebp + CFI_RESTORE ebp CFI_ADJUST_CFA_OFFSET -4 popl %edx CFI_RESTORE edx CFI_ADJUST_CFA_OFFSET -4 + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 64df47148160..b8f69e264ac4 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -12,6 +12,7 @@ #include <linux/random.h> #include <linux/elf.h> #include <linux/cpu.h> +#include <asm/pvclock.h> #include <asm/vgtod.h> #include <asm/proto.h> #include <asm/vdso.h> @@ -100,6 +101,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) .name = "[vvar]", .pages = no_pages, }; + struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -169,6 +171,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) } #endif + pvti = pvclock_pvti_cpu0_va(); + if (pvti && image->sym_pvclock_page) { + ret = remap_pfn_range(vma, + text_start + image->sym_pvclock_page, + __pa(pvti) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + } + up_fail: if (ret) current->mm->context.vdso = NULL; |