diff options
Diffstat (limited to 'arch')
162 files changed, 2553 insertions, 1299 deletions
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 91cf4055acab..bd4670d1b89b 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -313,11 +313,11 @@ config ARC_PAGE_SIZE_8K config ARC_PAGE_SIZE_16K bool "16KB" - depends on ARC_MMU_V3 + depends on ARC_MMU_V3 || ARC_MMU_V4 config ARC_PAGE_SIZE_4K bool "4KB" - depends on ARC_MMU_V3 + depends on ARC_MMU_V3 || ARC_MMU_V4 endchoice @@ -365,6 +365,11 @@ config ARC_HAS_LLSC default y depends on !ARC_CANT_LLSC +config ARC_STAR_9000923308 + bool "Workaround for llock/scond livelock" + default y + depends on ISA_ARCV2 && SMP && ARC_HAS_LLSC + config ARC_HAS_SWAPE bool "Insn: SWAPE (endian-swap)" default y @@ -379,6 +384,10 @@ config ARC_HAS_LL64 dest operands with 2 possible source operands. default y +config ARC_HAS_DIV_REM + bool "Insn: div, divu, rem, remu" + default y + config ARC_HAS_RTC bool "Local 64-bit r/o cycle counter" default n diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 46d87310220d..8a27a48304a4 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -36,8 +36,16 @@ cflags-$(atleast_gcc44) += -fsection-anchors cflags-$(CONFIG_ARC_HAS_LLSC) += -mlock cflags-$(CONFIG_ARC_HAS_SWAPE) += -mswape +ifdef CONFIG_ISA_ARCV2 + ifndef CONFIG_ARC_HAS_LL64 -cflags-$(CONFIG_ISA_ARCV2) += -mno-ll64 +cflags-y += -mno-ll64 +endif + +ifndef CONFIG_ARC_HAS_DIV_REM +cflags-y += -mno-div-rem +endif + endif cflags-$(CONFIG_ARC_DW2_UNWIND) += -fasynchronous-unwind-tables diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 070f58827a5c..c8f57b8449dc 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -89,11 +89,10 @@ #define ECR_C_BIT_DTLB_LD_MISS 8 #define ECR_C_BIT_DTLB_ST_MISS 9 - /* Auxiliary registers */ #define AUX_IDENTITY 4 #define AUX_INTR_VEC_BASE 0x25 - +#define AUX_NON_VOL 0x5e /* * Floating Pt Registers @@ -240,9 +239,9 @@ struct bcr_extn_xymem { struct bcr_perip { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int start:8, pad2:8, sz:8, pad:8; + unsigned int start:8, pad2:8, sz:8, ver:8; #else - unsigned int pad:8, sz:8, pad2:8, start:8; + unsigned int ver:8, sz:8, pad2:8, start:8; #endif }; diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 03484cb4d16d..87d18ae53115 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -23,33 +23,60 @@ #define atomic_set(v, i) (((v)->counter) = (i)) -#ifdef CONFIG_ISA_ARCV2 -#define PREFETCHW " prefetchw [%1] \n" -#else -#define PREFETCHW +#ifdef CONFIG_ARC_STAR_9000923308 + +#define SCOND_FAIL_RETRY_VAR_DEF \ + unsigned int delay = 1, tmp; \ + +#define SCOND_FAIL_RETRY_ASM \ + " bz 4f \n" \ + " ; --- scond fail delay --- \n" \ + " mov %[tmp], %[delay] \n" /* tmp = delay */ \ + "2: brne.d %[tmp], 0, 2b \n" /* while (tmp != 0) */ \ + " sub %[tmp], %[tmp], 1 \n" /* tmp-- */ \ + " rol %[delay], %[delay] \n" /* delay *= 2 */ \ + " b 1b \n" /* start over */ \ + "4: ; --- success --- \n" \ + +#define SCOND_FAIL_RETRY_VARS \ + ,[delay] "+&r" (delay),[tmp] "=&r" (tmp) \ + +#else /* !CONFIG_ARC_STAR_9000923308 */ + +#define SCOND_FAIL_RETRY_VAR_DEF + +#define SCOND_FAIL_RETRY_ASM \ + " bnz 1b \n" \ + +#define SCOND_FAIL_RETRY_VARS + #endif #define ATOMIC_OP(op, c_op, asm_op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ - unsigned int temp; \ + unsigned int val; \ + SCOND_FAIL_RETRY_VAR_DEF \ \ __asm__ __volatile__( \ - "1: \n" \ - PREFETCHW \ - " llock %0, [%1] \n" \ - " " #asm_op " %0, %0, %2 \n" \ - " scond %0, [%1] \n" \ - " bnz 1b \n" \ - : "=&r"(temp) /* Early clobber, to prevent reg reuse */ \ - : "r"(&v->counter), "ir"(i) \ + "1: llock %[val], [%[ctr]] \n" \ + " " #asm_op " %[val], %[val], %[i] \n" \ + " scond %[val], [%[ctr]] \n" \ + " \n" \ + SCOND_FAIL_RETRY_ASM \ + \ + : [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \ + SCOND_FAIL_RETRY_VARS \ + : [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \ + [i] "ir" (i) \ : "cc"); \ } \ #define ATOMIC_OP_RETURN(op, c_op, asm_op) \ static inline int atomic_##op##_return(int i, atomic_t *v) \ { \ - unsigned int temp; \ + unsigned int val; \ + SCOND_FAIL_RETRY_VAR_DEF \ \ /* \ * Explicit full memory barrier needed before/after as \ @@ -58,19 +85,21 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \ smp_mb(); \ \ __asm__ __volatile__( \ - "1: \n" \ - PREFETCHW \ - " llock %0, [%1] \n" \ - " " #asm_op " %0, %0, %2 \n" \ - " scond %0, [%1] \n" \ - " bnz 1b \n" \ - : "=&r"(temp) \ - : "r"(&v->counter), "ir"(i) \ + "1: llock %[val], [%[ctr]] \n" \ + " " #asm_op " %[val], %[val], %[i] \n" \ + " scond %[val], [%[ctr]] \n" \ + " \n" \ + SCOND_FAIL_RETRY_ASM \ + \ + : [val] "=&r" (val) \ + SCOND_FAIL_RETRY_VARS \ + : [ctr] "r" (&v->counter), \ + [i] "ir" (i) \ : "cc"); \ \ smp_mb(); \ \ - return temp; \ + return val; \ } #else /* !CONFIG_ARC_HAS_LLSC */ @@ -150,6 +179,9 @@ ATOMIC_OP(and, &=, and) #undef ATOMIC_OPS #undef ATOMIC_OP_RETURN #undef ATOMIC_OP +#undef SCOND_FAIL_RETRY_VAR_DEF +#undef SCOND_FAIL_RETRY_ASM +#undef SCOND_FAIL_RETRY_VARS /** * __atomic_add_unless - add unless the number is a given value diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h index 91694ec1ce95..69095da1fcfd 100644 --- a/arch/arc/include/asm/ptrace.h +++ b/arch/arc/include/asm/ptrace.h @@ -20,20 +20,20 @@ struct pt_regs { /* Real registers */ - long bta; /* bta_l1, bta_l2, erbta */ + unsigned long bta; /* bta_l1, bta_l2, erbta */ - long lp_start, lp_end, lp_count; + unsigned long lp_start, lp_end, lp_count; - long status32; /* status32_l1, status32_l2, erstatus */ - long ret; /* ilink1, ilink2 or eret */ - long blink; - long fp; - long r26; /* gp */ + unsigned long status32; /* status32_l1, status32_l2, erstatus */ + unsigned long ret; /* ilink1, ilink2 or eret */ + unsigned long blink; + unsigned long fp; + unsigned long r26; /* gp */ - long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0; + unsigned long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0; - long sp; /* user/kernel sp depending on where we came from */ - long orig_r0; + unsigned long sp; /* User/Kernel depending on where we came from */ + unsigned long orig_r0; /* * To distinguish bet excp, syscall, irq @@ -55,13 +55,13 @@ struct pt_regs { unsigned long event; }; - long user_r25; + unsigned long user_r25; }; #else struct pt_regs { - long orig_r0; + unsigned long orig_r0; union { struct { @@ -76,26 +76,26 @@ struct pt_regs { unsigned long event; }; - long bta; /* bta_l1, bta_l2, erbta */ + unsigned long bta; /* bta_l1, bta_l2, erbta */ - long user_r25; + unsigned long user_r25; - long r26; /* gp */ - long fp; - long sp; /* user/kernel sp depending on where we came from */ + unsigned long r26; /* gp */ + unsigned long fp; + unsigned long sp; /* user/kernel sp depending on where we came from */ - long r12; + unsigned long r12; /*------- Below list auto saved by h/w -----------*/ - long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; + unsigned long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; - long blink; - long lp_end, lp_start, lp_count; + unsigned long blink; + unsigned long lp_end, lp_start, lp_count; - long ei, ldi, jli; + unsigned long ei, ldi, jli; - long ret; - long status32; + unsigned long ret; + unsigned long status32; }; #endif @@ -103,10 +103,10 @@ struct pt_regs { /* Callee saved registers - need to be saved only when you are scheduled out */ struct callee_regs { - long r25, r24, r23, r22, r21, r20, r19, r18, r17, r16, r15, r14, r13; + unsigned long r25, r24, r23, r22, r21, r20, r19, r18, r17, r16, r15, r14, r13; }; -#define instruction_pointer(regs) (unsigned long)((regs)->ret) +#define instruction_pointer(regs) ((regs)->ret) #define profile_pc(regs) instruction_pointer(regs) /* return 1 if user mode or 0 if kernel mode */ @@ -142,7 +142,7 @@ struct callee_regs { static inline long regs_return_value(struct pt_regs *regs) { - return regs->r0; + return (long)regs->r0; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index e1651df6a93d..db8c59d1eaeb 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -18,9 +18,518 @@ #define arch_spin_unlock_wait(x) \ do { while (arch_spin_is_locked(x)) cpu_relax(); } while (0) +#ifdef CONFIG_ARC_HAS_LLSC + +/* + * A normal LLOCK/SCOND based system, w/o need for livelock workaround + */ +#ifndef CONFIG_ARC_STAR_9000923308 + static inline void arch_spin_lock(arch_spinlock_t *lock) { - unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__; + unsigned int val; + + smp_mb(); + + __asm__ __volatile__( + "1: llock %[val], [%[slock]] \n" + " breq %[val], %[LOCKED], 1b \n" /* spin while LOCKED */ + " scond %[LOCKED], [%[slock]] \n" /* acquire */ + " bnz 1b \n" + " \n" + : [val] "=&r" (val) + : [slock] "r" (&(lock->slock)), + [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__) + : "memory", "cc"); + + smp_mb(); +} + +/* 1 - lock taken successfully */ +static inline int arch_spin_trylock(arch_spinlock_t *lock) +{ + unsigned int val, got_it = 0; + + smp_mb(); + + __asm__ __volatile__( + "1: llock %[val], [%[slock]] \n" + " breq %[val], %[LOCKED], 4f \n" /* already LOCKED, just bail */ + " scond %[LOCKED], [%[slock]] \n" /* acquire */ + " bnz 1b \n" + " mov %[got_it], 1 \n" + "4: \n" + " \n" + : [val] "=&r" (val), + [got_it] "+&r" (got_it) + : [slock] "r" (&(lock->slock)), + [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__) + : "memory", "cc"); + + smp_mb(); + + return got_it; +} + +static inline void arch_spin_unlock(arch_spinlock_t *lock) +{ + smp_mb(); + + lock->slock = __ARCH_SPIN_LOCK_UNLOCKED__; + + smp_mb(); +} + +/* + * Read-write spinlocks, allowing multiple readers but only one writer. + * Unfair locking as Writers could be starved indefinitely by Reader(s) + */ + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + unsigned int val; + + smp_mb(); + + /* + * zero means writer holds the lock exclusively, deny Reader. + * Otherwise grant lock to first/subseq reader + * + * if (rw->counter > 0) { + * rw->counter--; + * ret = 1; + * } + */ + + __asm__ __volatile__( + "1: llock %[val], [%[rwlock]] \n" + " brls %[val], %[WR_LOCKED], 1b\n" /* <= 0: spin while write locked */ + " sub %[val], %[val], 1 \n" /* reader lock */ + " scond %[val], [%[rwlock]] \n" + " bnz 1b \n" + " \n" + : [val] "=&r" (val) + : [rwlock] "r" (&(rw->counter)), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); +} + +/* 1 - lock taken successfully */ +static inline int arch_read_trylock(arch_rwlock_t *rw) +{ + unsigned int val, got_it = 0; + + smp_mb(); + + __asm__ __volatile__( + "1: llock %[val], [%[rwlock]] \n" + " brls %[val], %[WR_LOCKED], 4f\n" /* <= 0: already write locked, bail */ + " sub %[val], %[val], 1 \n" /* counter-- */ + " scond %[val], [%[rwlock]] \n" + " bnz 1b \n" /* retry if collided with someone */ + " mov %[got_it], 1 \n" + " \n" + "4: ; --- done --- \n" + + : [val] "=&r" (val), + [got_it] "+&r" (got_it) + : [rwlock] "r" (&(rw->counter)), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); + + return got_it; +} + +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + unsigned int val; + + smp_mb(); + + /* + * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__), + * deny writer. Otherwise if unlocked grant to writer + * Hence the claim that Linux rwlocks are unfair to writers. + * (can be starved for an indefinite time by readers). + * + * if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) { + * rw->counter = 0; + * ret = 1; + * } + */ + + __asm__ __volatile__( + "1: llock %[val], [%[rwlock]] \n" + " brne %[val], %[UNLOCKED], 1b \n" /* while !UNLOCKED spin */ + " mov %[val], %[WR_LOCKED] \n" + " scond %[val], [%[rwlock]] \n" + " bnz 1b \n" + " \n" + : [val] "=&r" (val) + : [rwlock] "r" (&(rw->counter)), + [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); +} + +/* 1 - lock taken successfully */ +static inline int arch_write_trylock(arch_rwlock_t *rw) +{ + unsigned int val, got_it = 0; + + smp_mb(); + + __asm__ __volatile__( + "1: llock %[val], [%[rwlock]] \n" + " brne %[val], %[UNLOCKED], 4f \n" /* !UNLOCKED, bail */ + " mov %[val], %[WR_LOCKED] \n" + " scond %[val], [%[rwlock]] \n" + " bnz 1b \n" /* retry if collided with someone */ + " mov %[got_it], 1 \n" + " \n" + "4: ; --- done --- \n" + + : [val] "=&r" (val), + [got_it] "+&r" (got_it) + : [rwlock] "r" (&(rw->counter)), + [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); + + return got_it; +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) +{ + unsigned int val; + + smp_mb(); + + /* + * rw->counter++; + */ + __asm__ __volatile__( + "1: llock %[val], [%[rwlock]] \n" + " add %[val], %[val], 1 \n" + " scond %[val], [%[rwlock]] \n" + " bnz 1b \n" + " \n" + : [val] "=&r" (val) + : [rwlock] "r" (&(rw->counter)) + : "memory", "cc"); + + smp_mb(); +} + +static inline void arch_write_unlock(arch_rwlock_t *rw) +{ + smp_mb(); + + rw->counter = __ARCH_RW_LOCK_UNLOCKED__; + + smp_mb(); +} + +#else /* CONFIG_ARC_STAR_9000923308 */ + +/* + * HS38x4 could get into a LLOCK/SCOND livelock in case of multiple overlapping + * coherency transactions in the SCU. The exclusive line state keeps rotating + * among contenting cores leading to a never ending cycle. So break the cycle + * by deferring the retry of failed exclusive access (SCOND). The actual delay + * needed is function of number of contending cores as well as the unrelated + * coherency traffic from other cores. To keep the code simple, start off with + * small delay of 1 which would suffice most cases and in case of contention + * double the delay. Eventually the delay is sufficient such that the coherency + * pipeline is drained, thus a subsequent exclusive access would succeed. + */ + +#define SCOND_FAIL_RETRY_VAR_DEF \ + unsigned int delay, tmp; \ + +#define SCOND_FAIL_RETRY_ASM \ + " ; --- scond fail delay --- \n" \ + " mov %[tmp], %[delay] \n" /* tmp = delay */ \ + "2: brne.d %[tmp], 0, 2b \n" /* while (tmp != 0) */ \ + " sub %[tmp], %[tmp], 1 \n" /* tmp-- */ \ + " rol %[delay], %[delay] \n" /* delay *= 2 */ \ + " b 1b \n" /* start over */ \ + " \n" \ + "4: ; --- done --- \n" \ + +#define SCOND_FAIL_RETRY_VARS \ + ,[delay] "=&r" (delay), [tmp] "=&r" (tmp) \ + +static inline void arch_spin_lock(arch_spinlock_t *lock) +{ + unsigned int val; + SCOND_FAIL_RETRY_VAR_DEF; + + smp_mb(); + + __asm__ __volatile__( + "0: mov %[delay], 1 \n" + "1: llock %[val], [%[slock]] \n" + " breq %[val], %[LOCKED], 0b \n" /* spin while LOCKED */ + " scond %[LOCKED], [%[slock]] \n" /* acquire */ + " bz 4f \n" /* done */ + " \n" + SCOND_FAIL_RETRY_ASM + + : [val] "=&r" (val) + SCOND_FAIL_RETRY_VARS + : [slock] "r" (&(lock->slock)), + [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__) + : "memory", "cc"); + + smp_mb(); +} + +/* 1 - lock taken successfully */ +static inline int arch_spin_trylock(arch_spinlock_t *lock) +{ + unsigned int val, got_it = 0; + SCOND_FAIL_RETRY_VAR_DEF; + + smp_mb(); + + __asm__ __volatile__( + "0: mov %[delay], 1 \n" + "1: llock %[val], [%[slock]] \n" + " breq %[val], %[LOCKED], 4f \n" /* already LOCKED, just bail */ + " scond %[LOCKED], [%[slock]] \n" /* acquire */ + " bz.d 4f \n" + " mov.z %[got_it], 1 \n" /* got it */ + " \n" + SCOND_FAIL_RETRY_ASM + + : [val] "=&r" (val), + [got_it] "+&r" (got_it) + SCOND_FAIL_RETRY_VARS + : [slock] "r" (&(lock->slock)), + [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__) + : "memory", "cc"); + + smp_mb(); + + return got_it; +} + +static inline void arch_spin_unlock(arch_spinlock_t *lock) +{ + smp_mb(); + + lock->slock = __ARCH_SPIN_LOCK_UNLOCKED__; + + smp_mb(); +} + +/* + * Read-write spinlocks, allowing multiple readers but only one writer. + * Unfair locking as Writers could be starved indefinitely by Reader(s) + */ + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + unsigned int val; + SCOND_FAIL_RETRY_VAR_DEF; + + smp_mb(); + + /* + * zero means writer holds the lock exclusively, deny Reader. + * Otherwise grant lock to first/subseq reader + * + * if (rw->counter > 0) { + * rw->counter--; + * ret = 1; + * } + */ + + __asm__ __volatile__( + "0: mov %[delay], 1 \n" + "1: llock %[val], [%[rwlock]] \n" + " brls %[val], %[WR_LOCKED], 0b\n" /* <= 0: spin while write locked */ + " sub %[val], %[val], 1 \n" /* reader lock */ + " scond %[val], [%[rwlock]] \n" + " bz 4f \n" /* done */ + " \n" + SCOND_FAIL_RETRY_ASM + + : [val] "=&r" (val) + SCOND_FAIL_RETRY_VARS + : [rwlock] "r" (&(rw->counter)), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); +} + +/* 1 - lock taken successfully */ +static inline int arch_read_trylock(arch_rwlock_t *rw) +{ + unsigned int val, got_it = 0; + SCOND_FAIL_RETRY_VAR_DEF; + + smp_mb(); + + __asm__ __volatile__( + "0: mov %[delay], 1 \n" + "1: llock %[val], [%[rwlock]] \n" + " brls %[val], %[WR_LOCKED], 4f\n" /* <= 0: already write locked, bail */ + " sub %[val], %[val], 1 \n" /* counter-- */ + " scond %[val], [%[rwlock]] \n" + " bz.d 4f \n" + " mov.z %[got_it], 1 \n" /* got it */ + " \n" + SCOND_FAIL_RETRY_ASM + + : [val] "=&r" (val), + [got_it] "+&r" (got_it) + SCOND_FAIL_RETRY_VARS + : [rwlock] "r" (&(rw->counter)), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); + + return got_it; +} + +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + unsigned int val; + SCOND_FAIL_RETRY_VAR_DEF; + + smp_mb(); + + /* + * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__), + * deny writer. Otherwise if unlocked grant to writer + * Hence the claim that Linux rwlocks are unfair to writers. + * (can be starved for an indefinite time by readers). + * + * if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) { + * rw->counter = 0; + * ret = 1; + * } + */ + + __asm__ __volatile__( + "0: mov %[delay], 1 \n" + "1: llock %[val], [%[rwlock]] \n" + " brne %[val], %[UNLOCKED], 0b \n" /* while !UNLOCKED spin */ + " mov %[val], %[WR_LOCKED] \n" + " scond %[val], [%[rwlock]] \n" + " bz 4f \n" + " \n" + SCOND_FAIL_RETRY_ASM + + : [val] "=&r" (val) + SCOND_FAIL_RETRY_VARS + : [rwlock] "r" (&(rw->counter)), + [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); +} + +/* 1 - lock taken successfully */ +static inline int arch_write_trylock(arch_rwlock_t *rw) +{ + unsigned int val, got_it = 0; + SCOND_FAIL_RETRY_VAR_DEF; + + smp_mb(); + + __asm__ __volatile__( + "0: mov %[delay], 1 \n" + "1: llock %[val], [%[rwlock]] \n" + " brne %[val], %[UNLOCKED], 4f \n" /* !UNLOCKED, bail */ + " mov %[val], %[WR_LOCKED] \n" + " scond %[val], [%[rwlock]] \n" + " bz.d 4f \n" + " mov.z %[got_it], 1 \n" /* got it */ + " \n" + SCOND_FAIL_RETRY_ASM + + : [val] "=&r" (val), + [got_it] "+&r" (got_it) + SCOND_FAIL_RETRY_VARS + : [rwlock] "r" (&(rw->counter)), + [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__), + [WR_LOCKED] "ir" (0) + : "memory", "cc"); + + smp_mb(); + + return got_it; +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) +{ + unsigned int val; + + smp_mb(); + + /* + * rw->counter++; + */ + __asm__ __volatile__( + "1: llock %[val], [%[rwlock]] \n" + " add %[val], %[val], 1 \n" + " scond %[val], [%[rwlock]] \n" + " bnz 1b \n" + " \n" + : [val] "=&r" (val) + : [rwlock] "r" (&(rw->counter)) + : "memory", "cc"); + + smp_mb(); +} + +static inline void arch_write_unlock(arch_rwlock_t *rw) +{ + unsigned int val; + + smp_mb(); + + /* + * rw->counter = __ARCH_RW_LOCK_UNLOCKED__; + */ + __asm__ __volatile__( + "1: llock %[val], [%[rwlock]] \n" + " scond %[UNLOCKED], [%[rwlock]]\n" + " bnz 1b \n" + " \n" + : [val] "=&r" (val) + : [rwlock] "r" (&(rw->counter)), + [UNLOCKED] "r" (__ARCH_RW_LOCK_UNLOCKED__) + : "memory", "cc"); + + smp_mb(); +} + +#undef SCOND_FAIL_RETRY_VAR_DEF +#undef SCOND_FAIL_RETRY_ASM +#undef SCOND_FAIL_RETRY_VARS + +#endif /* CONFIG_ARC_STAR_9000923308 */ + +#else /* !CONFIG_ARC_HAS_LLSC */ + +static inline void arch_spin_lock(arch_spinlock_t *lock) +{ + unsigned int val = __ARCH_SPIN_LOCK_LOCKED__; /* * This smp_mb() is technically superfluous, we only need the one @@ -33,7 +542,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) __asm__ __volatile__( "1: ex %0, [%1] \n" " breq %0, %2, 1b \n" - : "+&r" (tmp) + : "+&r" (val) : "r"(&(lock->slock)), "ir"(__ARCH_SPIN_LOCK_LOCKED__) : "memory"); @@ -48,26 +557,27 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) smp_mb(); } +/* 1 - lock taken successfully */ static inline int arch_spin_trylock(arch_spinlock_t *lock) { - unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__; + unsigned int val = __ARCH_SPIN_LOCK_LOCKED__; smp_mb(); __asm__ __volatile__( "1: ex %0, [%1] \n" - : "+r" (tmp) + : "+r" (val) : "r"(&(lock->slock)) : "memory"); smp_mb(); - return (tmp == __ARCH_SPIN_LOCK_UNLOCKED__); + return (val == __ARCH_SPIN_LOCK_UNLOCKED__); } static inline void arch_spin_unlock(arch_spinlock_t *lock) { - unsigned int tmp = __ARCH_SPIN_LOCK_UNLOCKED__; + unsigned int val = __ARCH_SPIN_LOCK_UNLOCKED__; /* * RELEASE barrier: given the instructions avail on ARCv2, full barrier @@ -77,7 +587,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __asm__ __volatile__( " ex %0, [%1] \n" - : "+r" (tmp) + : "+r" (val) : "r"(&(lock->slock)) : "memory"); @@ -90,19 +600,12 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) /* * Read-write spinlocks, allowing multiple readers but only one writer. + * Unfair locking as Writers could be starved indefinitely by Reader(s) * * The spinlock itself is contained in @counter and access to it is * serialized with @lock_mutex. - * - * Unfair locking as Writers could be starved indefinitely by Reader(s) */ -/* Would read_trylock() succeed? */ -#define arch_read_can_lock(x) ((x)->counter > 0) - -/* Would write_trylock() succeed? */ -#define arch_write_can_lock(x) ((x)->counter == __ARCH_RW_LOCK_UNLOCKED__) - /* 1 - lock taken successfully */ static inline int arch_read_trylock(arch_rwlock_t *rw) { @@ -173,6 +676,11 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) arch_spin_unlock(&(rw->lock_mutex)); } +#endif + +#define arch_read_can_lock(x) ((x)->counter > 0) +#define arch_write_can_lock(x) ((x)->counter == __ARCH_RW_LOCK_UNLOCKED__) + #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/arc/include/asm/spinlock_types.h b/arch/arc/include/asm/spinlock_types.h index 662627ced4f2..4e1ef5f650c6 100644 --- a/arch/arc/include/asm/spinlock_types.h +++ b/arch/arc/include/asm/spinlock_types.h @@ -26,7 +26,9 @@ typedef struct { */ typedef struct { volatile unsigned int counter; +#ifndef CONFIG_ARC_HAS_LLSC arch_spinlock_t lock_mutex; +#endif } arch_rwlock_t; #define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000 diff --git a/arch/arc/include/uapi/asm/ptrace.h b/arch/arc/include/uapi/asm/ptrace.h index 76a7739aab1c..0b3ef63d4a03 100644 --- a/arch/arc/include/uapi/asm/ptrace.h +++ b/arch/arc/include/uapi/asm/ptrace.h @@ -32,20 +32,20 @@ */ struct user_regs_struct { - long pad; + unsigned long pad; struct { - long bta, lp_start, lp_end, lp_count; - long status32, ret, blink, fp, gp; - long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0; - long sp; + unsigned long bta, lp_start, lp_end, lp_count; + unsigned long status32, ret, blink, fp, gp; + unsigned long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0; + unsigned long sp; } scratch; - long pad2; + unsigned long pad2; struct { - long r25, r24, r23, r22, r21, r20; - long r19, r18, r17, r16, r15, r14, r13; + unsigned long r25, r24, r23, r22, r21, r20; + unsigned long r19, r18, r17, r16, r15, r14, r13; } callee; - long efa; /* break pt addr, for break points in delay slots */ - long stop_pc; /* give dbg stop_pc after ensuring brkpt trap */ + unsigned long efa; /* break pt addr, for break points in delay slots */ + unsigned long stop_pc; /* give dbg stop_pc after ensuring brkpt trap */ }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 18cc01591c96..cabde9dc0696 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -47,6 +47,7 @@ static void read_arc_build_cfg_regs(void) struct bcr_perip uncached_space; struct bcr_generic bcr; struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; + unsigned long perip_space; FIX_PTR(cpu); READ_BCR(AUX_IDENTITY, cpu->core); @@ -56,7 +57,12 @@ static void read_arc_build_cfg_regs(void) cpu->vec_base = read_aux_reg(AUX_INTR_VEC_BASE); READ_BCR(ARC_REG_D_UNCACH_BCR, uncached_space); - BUG_ON((uncached_space.start << 24) != ARC_UNCACHED_ADDR_SPACE); + if (uncached_space.ver < 3) + perip_space = uncached_space.start << 24; + else + perip_space = read_aux_reg(AUX_NON_VOL) & 0xF0000000; + + BUG_ON(perip_space != ARC_UNCACHED_ADDR_SPACE); READ_BCR(ARC_REG_MUL_BCR, cpu->extn_mpy); @@ -330,6 +336,10 @@ static void arc_chk_core_config(void) pr_warn("CONFIG_ARC_FPU_SAVE_RESTORE needed for working apps\n"); else if (!cpu->extn.fpu_dp && fpu_enabled) panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n"); + + if (is_isa_arcv2() && IS_ENABLED(CONFIG_SMP) && cpu->isa.atomic && + !IS_ENABLED(CONFIG_ARC_STAR_9000923308)) + panic("llock/scond livelock workaround missing\n"); } /* diff --git a/arch/arc/kernel/time.c b/arch/arc/kernel/time.c index 3364d2bbc515..4294761a2b3e 100644 --- a/arch/arc/kernel/time.c +++ b/arch/arc/kernel/time.c @@ -203,34 +203,24 @@ static int arc_clkevent_set_next_event(unsigned long delta, return 0; } -static void arc_clkevent_set_mode(enum clock_event_mode mode, - struct clock_event_device *dev) +static int arc_clkevent_set_periodic(struct clock_event_device *dev) { - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - /* - * At X Hz, 1 sec = 1000ms -> X cycles; - * 10ms -> X / 100 cycles - */ - arc_timer_event_setup(arc_get_core_freq() / HZ); - break; - case CLOCK_EVT_MODE_ONESHOT: - break; - default: - break; - } - - return; + /* + * At X Hz, 1 sec = 1000ms -> X cycles; + * 10ms -> X / 100 cycles + */ + arc_timer_event_setup(arc_get_core_freq() / HZ); + return 0; } static DEFINE_PER_CPU(struct clock_event_device, arc_clockevent_device) = { - .name = "ARC Timer0", - .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, - .mode = CLOCK_EVT_MODE_UNUSED, - .rating = 300, - .irq = TIMER0_IRQ, /* hardwired, no need for resources */ - .set_next_event = arc_clkevent_set_next_event, - .set_mode = arc_clkevent_set_mode, + .name = "ARC Timer0", + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_PERIODIC, + .rating = 300, + .irq = TIMER0_IRQ, /* hardwired, no need for resources */ + .set_next_event = arc_clkevent_set_next_event, + .set_state_periodic = arc_clkevent_set_periodic, }; static irqreturn_t timer_irq_handler(int irq, void *dev_id) @@ -240,7 +230,7 @@ static irqreturn_t timer_irq_handler(int irq, void *dev_id) * irq_set_chip_and_handler() asked for handle_percpu_devid_irq() */ struct clock_event_device *evt = this_cpu_ptr(&arc_clockevent_device); - int irq_reenable = evt->mode == CLOCK_EVT_MODE_PERIODIC; + int irq_reenable = clockevent_state_periodic(evt); /* * Any write to CTRL reg ACks the interrupt, we rewrite the diff --git a/arch/arc/lib/memcpy-archs.S b/arch/arc/lib/memcpy-archs.S index 1b2b3acfed52..0cab0b8a57c5 100644 --- a/arch/arc/lib/memcpy-archs.S +++ b/arch/arc/lib/memcpy-archs.S @@ -206,7 +206,7 @@ unalignedOffby3: ld.ab r6, [r1, 4] prefetch [r1, 28] ;Prefetch the next read location ld.ab r8, [r1,4] - prefetch [r3, 32] ;Prefetch the next write location + prefetchw [r3, 32] ;Prefetch the next write location SHIFT_1 (r7, r6, 8) or r7, r7, r5 diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S index 92d573c734b5..365b18364815 100644 --- a/arch/arc/lib/memset-archs.S +++ b/arch/arc/lib/memset-archs.S @@ -10,12 +10,6 @@ #undef PREALLOC_NOT_AVAIL -#ifdef PREALLOC_NOT_AVAIL -#define PREWRITE(A,B) prefetchw [(A),(B)] -#else -#define PREWRITE(A,B) prealloc [(A),(B)] -#endif - ENTRY(memset) prefetchw [r0] ; Prefetch the write location mov.f 0, r2 @@ -51,9 +45,15 @@ ENTRY(memset) ;;; Convert len to Dwords, unfold x8 lsr.f lp_count, lp_count, 6 + lpnz @.Lset64bytes ;; LOOP START - PREWRITE(r3, 64) ;Prefetch the next write location +#ifdef PREALLOC_NOT_AVAIL + prefetchw [r3, 64] ;Prefetch the next write location +#else + prealloc [r3, 64] +#endif +#ifdef CONFIG_ARC_HAS_LL64 std.ab r4, [r3, 8] std.ab r4, [r3, 8] std.ab r4, [r3, 8] @@ -62,16 +62,45 @@ ENTRY(memset) std.ab r4, [r3, 8] std.ab r4, [r3, 8] std.ab r4, [r3, 8] +#else + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] +#endif .Lset64bytes: lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes lpnz .Lset32bytes ;; LOOP START prefetchw [r3, 32] ;Prefetch the next write location +#ifdef CONFIG_ARC_HAS_LL64 std.ab r4, [r3, 8] std.ab r4, [r3, 8] std.ab r4, [r3, 8] std.ab r4, [r3, 8] +#else + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] +#endif .Lset32bytes: and.f lp_count, r2, 0x1F ;Last remaining 31 bytes diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index 99f7da513a48..e7769c3ab5f2 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -389,6 +389,21 @@ axs103_set_freq(unsigned int id, unsigned int fd, unsigned int od) static void __init axs103_early_init(void) { + /* + * AXS103 configurations for SMP/QUAD configurations share device tree + * which defaults to 90 MHz. However recent failures of Quad config + * revealed P&R timing violations so clamp it down to safe 50 MHz + * Instead of duplicating defconfig/DT for SMP/QUAD, add a small hack + * + * This hack is really hacky as of now. Fix it properly by getting the + * number of cores as return value of platform's early SMP callback + */ +#ifdef CONFIG_ARC_MCIP + unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F; + if (num_cores > 2) + arc_set_core_freq(50 * 1000000); +#endif + switch (arc_get_core_freq()/1000000) { case 33: axs103_set_freq(1, 1, 1); diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 07ab3d203916..7451b447cc2d 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -312,6 +312,9 @@ INSTALL_TARGETS = zinstall uinstall install PHONY += bzImage $(BOOT_TARGETS) $(INSTALL_TARGETS) +bootpImage uImage: zImage +zImage: Image + $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/$@ diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index 8f1e25bcecbd..1e29ccf77ea2 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -116,7 +116,7 @@ ranges = <0 0x2000 0x2000>; scm_conf: scm_conf@0 { - compatible = "syscon"; + compatible = "syscon", "simple-bus"; reg = <0x0 0x1400>; #address-cells = <1>; #size-cells = <1>; @@ -1140,6 +1140,7 @@ ctrl-module = <&omap_control_sata>; clocks = <&sys_clkin1>, <&sata_ref_clk>; clock-names = "sysclk", "refclk"; + syscon-pllreset = <&scm_conf 0x3fc>; #phy-cells = <0>; }; diff --git a/arch/arm/boot/dts/exynos3250.dtsi b/arch/arm/boot/dts/exynos3250.dtsi index d7201333e3bc..2db99433e17f 100644 --- a/arch/arm/boot/dts/exynos3250.dtsi +++ b/arch/arm/boot/dts/exynos3250.dtsi @@ -138,8 +138,8 @@ mipi_phy: video-phy@10020710 { compatible = "samsung,s5pv210-mipi-video-phy"; - reg = <0x10020710 8>; #phy-cells = <1>; + syscon = <&pmu_system_controller>; }; pd_cam: cam-power-domain@10023C00 { diff --git a/arch/arm/boot/dts/exynos4210-origen.dts b/arch/arm/boot/dts/exynos4210-origen.dts index e0abfc3324d1..e050d85cdacd 100644 --- a/arch/arm/boot/dts/exynos4210-origen.dts +++ b/arch/arm/boot/dts/exynos4210-origen.dts @@ -127,6 +127,10 @@ }; }; +&cpu0 { + cpu0-supply = <&buck1_reg>; +}; + &fimd { pinctrl-0 = <&lcd_en &lcd_clk &lcd_data24 &pwm0_out>; pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/exynos4210-trats.dts b/arch/arm/boot/dts/exynos4210-trats.dts index 98f3ce65cb9a..ba34886f8b65 100644 --- a/arch/arm/boot/dts/exynos4210-trats.dts +++ b/arch/arm/boot/dts/exynos4210-trats.dts @@ -188,6 +188,10 @@ }; }; +&cpu0 { + cpu0-supply = <&varm_breg>; +}; + &dsi_0 { vddcore-supply = <&vusb_reg>; vddio-supply = <&vmipi_reg>; diff --git a/arch/arm/boot/dts/exynos4210-universal_c210.dts b/arch/arm/boot/dts/exynos4210-universal_c210.dts index d4f2b11319dd..775892b2cc6a 100644 --- a/arch/arm/boot/dts/exynos4210-universal_c210.dts +++ b/arch/arm/boot/dts/exynos4210-universal_c210.dts @@ -548,6 +548,10 @@ }; }; +&cpu0 { + cpu0-supply = <&vdd_arm_reg>; +}; + &pinctrl_1 { hdmi_hpd: hdmi-hpd { samsung,pins = "gpx3-7"; diff --git a/arch/arm/boot/dts/exynos4210.dtsi b/arch/arm/boot/dts/exynos4210.dtsi index 10d3c173396e..3e5ba665d200 100644 --- a/arch/arm/boot/dts/exynos4210.dtsi +++ b/arch/arm/boot/dts/exynos4210.dtsi @@ -40,6 +40,18 @@ device_type = "cpu"; compatible = "arm,cortex-a9"; reg = <0x900>; + clocks = <&clock CLK_ARM_CLK>; + clock-names = "cpu"; + clock-latency = <160000>; + + operating-points = < + 1200000 1250000 + 1000000 1150000 + 800000 1075000 + 500000 975000 + 400000 975000 + 200000 950000 + >; cooling-min-level = <4>; cooling-max-level = <2>; #cooling-cells = <2>; /* min followed by max */ diff --git a/arch/arm/boot/dts/imx35.dtsi b/arch/arm/boot/dts/imx35.dtsi index b6478e97d6a7..e6540b5cfa4c 100644 --- a/arch/arm/boot/dts/imx35.dtsi +++ b/arch/arm/boot/dts/imx35.dtsi @@ -286,8 +286,8 @@ can1: can@53fe4000 { compatible = "fsl,imx35-flexcan", "fsl,p1010-flexcan"; reg = <0x53fe4000 0x1000>; - clocks = <&clks 33>; - clock-names = "ipg"; + clocks = <&clks 33>, <&clks 33>; + clock-names = "ipg", "per"; interrupts = <43>; status = "disabled"; }; @@ -295,8 +295,8 @@ can2: can@53fe8000 { compatible = "fsl,imx35-flexcan", "fsl,p1010-flexcan"; reg = <0x53fe8000 0x1000>; - clocks = <&clks 34>; - clock-names = "ipg"; + clocks = <&clks 34>, <&clks 34>; + clock-names = "ipg", "per"; interrupts = <44>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/imx6qdl.dtsi b/arch/arm/boot/dts/imx6qdl.dtsi index e6d13592080d..b57033e8c633 100644 --- a/arch/arm/boot/dts/imx6qdl.dtsi +++ b/arch/arm/boot/dts/imx6qdl.dtsi @@ -181,10 +181,10 @@ interrupt-names = "msi"; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 0x7>; - interrupt-map = <0 0 0 1 &intc GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>, - <0 0 0 2 &intc GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>, - <0 0 0 3 &intc GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, - <0 0 0 4 &intc GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>; + interrupt-map = <0 0 0 1 &gpc GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 2 &gpc GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 3 &gpc GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 4 &gpc GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clks IMX6QDL_CLK_PCIE_AXI>, <&clks IMX6QDL_CLK_LVDS1_GATE>, <&clks IMX6QDL_CLK_PCIE_REF_125M>; diff --git a/arch/arm/boot/dts/k2e-clocks.dtsi b/arch/arm/boot/dts/k2e-clocks.dtsi index 4773d6af66a0..d56d68fe7ffc 100644 --- a/arch/arm/boot/dts/k2e-clocks.dtsi +++ b/arch/arm/boot/dts/k2e-clocks.dtsi @@ -13,9 +13,8 @@ clocks { #clock-cells = <0>; compatible = "ti,keystone,main-pll-clock"; clocks = <&refclksys>; - reg = <0x02620350 4>, <0x02310110 4>; - reg-names = "control", "multiplier"; - fixed-postdiv = <2>; + reg = <0x02620350 4>, <0x02310110 4>, <0x02310108 4>; + reg-names = "control", "multiplier", "post-divider"; }; papllclk: papllclk@2620358 { diff --git a/arch/arm/boot/dts/k2e.dtsi b/arch/arm/boot/dts/k2e.dtsi index 1b6494fbdb91..675fb8e492c6 100644 --- a/arch/arm/boot/dts/k2e.dtsi +++ b/arch/arm/boot/dts/k2e.dtsi @@ -131,10 +131,17 @@ <GIC_SPI 376 IRQ_TYPE_EDGE_RISING>; }; }; + + mdio: mdio@24200f00 { + compatible = "ti,keystone_mdio", "ti,davinci_mdio"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x24200f00 0x100>; + status = "disabled"; + clocks = <&clkcpgmac>; + clock-names = "fck"; + bus_freq = <2500000>; + }; /include/ "k2e-netcp.dtsi" }; }; - -&mdio { - reg = <0x24200f00 0x100>; -}; diff --git a/arch/arm/boot/dts/k2hk-clocks.dtsi b/arch/arm/boot/dts/k2hk-clocks.dtsi index d5adee3c0067..af9b7190533a 100644 --- a/arch/arm/boot/dts/k2hk-clocks.dtsi +++ b/arch/arm/boot/dts/k2hk-clocks.dtsi @@ -22,9 +22,8 @@ clocks { #clock-cells = <0>; compatible = "ti,keystone,main-pll-clock"; clocks = <&refclksys>; - reg = <0x02620350 4>, <0x02310110 4>; - reg-names = "control", "multiplier"; - fixed-postdiv = <2>; + reg = <0x02620350 4>, <0x02310110 4>, <0x02310108 4>; + reg-names = "control", "multiplier", "post-divider"; }; papllclk: papllclk@2620358 { diff --git a/arch/arm/boot/dts/k2hk.dtsi b/arch/arm/boot/dts/k2hk.dtsi index ae6472407b22..d0810a5f2968 100644 --- a/arch/arm/boot/dts/k2hk.dtsi +++ b/arch/arm/boot/dts/k2hk.dtsi @@ -98,6 +98,17 @@ #gpio-cells = <2>; gpio,syscon-dev = <&devctrl 0x25c>; }; + + mdio: mdio@02090300 { + compatible = "ti,keystone_mdio", "ti,davinci_mdio"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x02090300 0x100>; + status = "disabled"; + clocks = <&clkcpgmac>; + clock-names = "fck"; + bus_freq = <2500000>; + }; /include/ "k2hk-netcp.dtsi" }; }; diff --git a/arch/arm/boot/dts/k2l-clocks.dtsi b/arch/arm/boot/dts/k2l-clocks.dtsi index eb1e3e29f073..ef8464bb11ff 100644 --- a/arch/arm/boot/dts/k2l-clocks.dtsi +++ b/arch/arm/boot/dts/k2l-clocks.dtsi @@ -22,9 +22,8 @@ clocks { #clock-cells = <0>; compatible = "ti,keystone,main-pll-clock"; clocks = <&refclksys>; - reg = <0x02620350 4>, <0x02310110 4>; - reg-names = "control", "multiplier"; - fixed-postdiv = <2>; + reg = <0x02620350 4>, <0x02310110 4>, <0x02310108 4>; + reg-names = "control", "multiplier", "post-divider"; }; papllclk: papllclk@2620358 { diff --git a/arch/arm/boot/dts/k2l.dtsi b/arch/arm/boot/dts/k2l.dtsi index 0e007483615e..49fd414f680c 100644 --- a/arch/arm/boot/dts/k2l.dtsi +++ b/arch/arm/boot/dts/k2l.dtsi @@ -29,7 +29,6 @@ }; soc { - /include/ "k2l-clocks.dtsi" uart2: serial@02348400 { @@ -79,6 +78,17 @@ #gpio-cells = <2>; gpio,syscon-dev = <&devctrl 0x24c>; }; + + mdio: mdio@26200f00 { + compatible = "ti,keystone_mdio", "ti,davinci_mdio"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x26200f00 0x100>; + status = "disabled"; + clocks = <&clkcpgmac>; + clock-names = "fck"; + bus_freq = <2500000>; + }; /include/ "k2l-netcp.dtsi" }; }; @@ -96,7 +106,3 @@ /* Pin muxed. Enabled and configured by Bootloader */ status = "disabled"; }; - -&mdio { - reg = <0x26200f00 0x100>; -}; diff --git a/arch/arm/boot/dts/keystone.dtsi b/arch/arm/boot/dts/keystone.dtsi index e7a6f6deabb6..72816d65f7ec 100644 --- a/arch/arm/boot/dts/keystone.dtsi +++ b/arch/arm/boot/dts/keystone.dtsi @@ -267,17 +267,6 @@ 1 0 0x21000A00 0x00000100>; }; - mdio: mdio@02090300 { - compatible = "ti,keystone_mdio", "ti,davinci_mdio"; - #address-cells = <1>; - #size-cells = <0>; - reg = <0x02090300 0x100>; - status = "disabled"; - clocks = <&clkpa>; - clock-names = "fck"; - bus_freq = <2500000>; - }; - kirq0: keystone_irq@26202a0 { compatible = "ti,keystone-irq"; interrupts = <GIC_SPI 4 IRQ_TYPE_EDGE_RISING>; diff --git a/arch/arm/boot/dts/omap2430.dtsi b/arch/arm/boot/dts/omap2430.dtsi index 11a7963be003..2390f387c271 100644 --- a/arch/arm/boot/dts/omap2430.dtsi +++ b/arch/arm/boot/dts/omap2430.dtsi @@ -51,7 +51,8 @@ }; scm_conf: scm_conf@270 { - compatible = "syscon"; + compatible = "syscon", + "simple-bus"; reg = <0x270 0x240>; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/boot/dts/omap4.dtsi b/arch/arm/boot/dts/omap4.dtsi index 7d31c6ff246f..abc4473e6f8a 100644 --- a/arch/arm/boot/dts/omap4.dtsi +++ b/arch/arm/boot/dts/omap4.dtsi @@ -191,7 +191,8 @@ }; omap4_padconf_global: omap4_padconf_global@5a0 { - compatible = "syscon"; + compatible = "syscon", + "simple-bus"; reg = <0x5a0 0x170>; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/boot/dts/omap5.dtsi b/arch/arm/boot/dts/omap5.dtsi index c8fd648a7108..b1a1263e6001 100644 --- a/arch/arm/boot/dts/omap5.dtsi +++ b/arch/arm/boot/dts/omap5.dtsi @@ -180,7 +180,8 @@ }; omap5_padconf_global: omap5_padconf_global@5a0 { - compatible = "syscon"; + compatible = "syscon", + "simple-bus"; reg = <0x5a0 0xec>; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/boot/dts/qcom-msm8974-sony-xperia-honami.dts b/arch/arm/boot/dts/qcom-msm8974-sony-xperia-honami.dts index bd35b0674ff6..9bc72a3356e4 100644 --- a/arch/arm/boot/dts/qcom-msm8974-sony-xperia-honami.dts +++ b/arch/arm/boot/dts/qcom-msm8974-sony-xperia-honami.dts @@ -17,3 +17,13 @@ status = "ok"; }; }; + +&spmi_bus { + pm8941@0 { + coincell@2800 { + status = "ok"; + qcom,rset-ohms = <2100>; + qcom,vset-millivolts = <3000>; + }; + }; +}; diff --git a/arch/arm/boot/dts/qcom-pm8941.dtsi b/arch/arm/boot/dts/qcom-pm8941.dtsi index aa774e685018..968f1043d4f5 100644 --- a/arch/arm/boot/dts/qcom-pm8941.dtsi +++ b/arch/arm/boot/dts/qcom-pm8941.dtsi @@ -125,6 +125,12 @@ interrupts = <0x0 0x36 0x0 IRQ_TYPE_EDGE_RISING>; qcom,external-resistor-micro-ohms = <10000>; }; + + coincell@2800 { + compatible = "qcom,pm8941-coincell"; + reg = <0x2800>; + status = "disabled"; + }; }; usid1: pm8941@1 { diff --git a/arch/arm/boot/dts/ste-dbx5x0.dtsi b/arch/arm/boot/dts/ste-dbx5x0.dtsi index a75f3289e653..b8f81fb418ce 100644 --- a/arch/arm/boot/dts/ste-dbx5x0.dtsi +++ b/arch/arm/boot/dts/ste-dbx5x0.dtsi @@ -15,6 +15,33 @@ #include "skeleton.dtsi" / { + cpus { + #address-cells = <1>; + #size-cells = <0>; + enable-method = "ste,dbx500-smp"; + + cpu-map { + cluster0 { + core0 { + cpu = <&CPU0>; + }; + core1 { + cpu = <&CPU1>; + }; + }; + }; + CPU0: cpu@300 { + device_type = "cpu"; + compatible = "arm,cortex-a9"; + reg = <0x300>; + }; + CPU1: cpu@301 { + device_type = "cpu"; + compatible = "arm,cortex-a9"; + reg = <0x301>; + }; + }; + soc { #address-cells = <1>; #size-cells = <1>; @@ -22,32 +49,6 @@ interrupt-parent = <&intc>; ranges; - cpus { - #address-cells = <1>; - #size-cells = <0>; - - cpu-map { - cluster0 { - core0 { - cpu = <&CPU0>; - }; - core1 { - cpu = <&CPU1>; - }; - }; - }; - CPU0: cpu@0 { - device_type = "cpu"; - compatible = "arm,cortex-a9"; - reg = <0>; - }; - CPU1: cpu@1 { - device_type = "cpu"; - compatible = "arm,cortex-a9"; - reg = <1>; - }; - }; - ptm@801ae000 { compatible = "arm,coresight-etm3x", "arm,primecell"; reg = <0x801ae000 0x1000>; diff --git a/arch/arm/boot/dts/ste-nomadik-nhk15.dts b/arch/arm/boot/dts/ste-nomadik-nhk15.dts index 3d0b8755caee..3d25dba143a5 100644 --- a/arch/arm/boot/dts/ste-nomadik-nhk15.dts +++ b/arch/arm/boot/dts/ste-nomadik-nhk15.dts @@ -17,6 +17,7 @@ }; aliases { + serial1 = &uart1; stmpe-i2c0 = &stmpe0; stmpe-i2c1 = &stmpe1; }; diff --git a/arch/arm/boot/dts/ste-nomadik-s8815.dts b/arch/arm/boot/dts/ste-nomadik-s8815.dts index 85d3b95dfdba..3c140d05f796 100644 --- a/arch/arm/boot/dts/ste-nomadik-s8815.dts +++ b/arch/arm/boot/dts/ste-nomadik-s8815.dts @@ -15,6 +15,10 @@ bootargs = "root=/dev/ram0 console=ttyAMA1,115200n8 earlyprintk"; }; + aliases { + serial1 = &uart1; + }; + src@101e0000 { /* These chrystal drivers are not used on this board */ disable-sxtalo; diff --git a/arch/arm/boot/dts/ste-nomadik-stn8815.dtsi b/arch/arm/boot/dts/ste-nomadik-stn8815.dtsi index 9a5f2ba139b7..ef794a33b4dc 100644 --- a/arch/arm/boot/dts/ste-nomadik-stn8815.dtsi +++ b/arch/arm/boot/dts/ste-nomadik-stn8815.dtsi @@ -757,6 +757,7 @@ clock-names = "uartclk", "apb_pclk"; pinctrl-names = "default"; pinctrl-0 = <&uart0_default_mux>; + status = "disabled"; }; uart1: uart@101fb000 { diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 92828a1dec80..b48dd4f37f80 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -61,6 +61,7 @@ work_pending: movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE) ldmia sp, {r0 - r6} @ have to reload r0 - r6 b local_restart @ ... and off we go +ENDPROC(ret_fast_syscall) /* * "slow" syscall return path. "why" tells us if this was a real syscall. diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index bd755d97e459..29e2991465cb 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -399,6 +399,9 @@ ENTRY(secondary_startup) sub lr, r4, r5 @ mmu has been enabled add r3, r7, lr ldrd r4, [r3, #0] @ get secondary_data.pgdir +ARM_BE8(eor r4, r4, r5) @ Swap r5 and r4 in BE: +ARM_BE8(eor r5, r4, r5) @ it can be done in 3 steps +ARM_BE8(eor r4, r4, r5) @ without using a temp reg. ldr r8, [r3, #8] @ get secondary_data.swapper_pg_dir badr lr, __enable_mmu @ return address mov r13, r12 @ __secondary_switched address diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index efe17dd9b921..54a5aeab988d 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -296,7 +296,6 @@ static bool tk_is_cntvct(const struct timekeeper *tk) */ void update_vsyscall(struct timekeeper *tk) { - struct timespec xtime_coarse; struct timespec64 *wtm = &tk->wall_to_monotonic; if (!cntvct_ok) { @@ -308,10 +307,10 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdso_data); - xtime_coarse = __current_kernel_time(); vdso_data->tk_is_cntvct = tk_is_cntvct(tk); - vdso_data->xtime_coarse_sec = xtime_coarse.tv_sec; - vdso_data->xtime_coarse_nsec = xtime_coarse.tv_nsec; + vdso_data->xtime_coarse_sec = tk->xtime_sec; + vdso_data->xtime_coarse_nsec = (u32)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); vdso_data->wtm_clock_sec = wtm->tv_sec; vdso_data->wtm_clock_nsec = wtm->tv_nsec; diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index 3e58d710013c..4b39af2dfda9 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -96,7 +96,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) } /* the mmap semaphore is taken only if not in an atomic context */ - atomic = in_atomic(); + atomic = faulthandler_disabled(); if (!atomic) down_read(¤t->mm->mmap_sem); diff --git a/arch/arm/mach-exynos/pm_domains.c b/arch/arm/mach-exynos/pm_domains.c index 6001f1c9d136..4a87e86dec45 100644 --- a/arch/arm/mach-exynos/pm_domains.c +++ b/arch/arm/mach-exynos/pm_domains.c @@ -146,9 +146,8 @@ static __init int exynos4_pm_init_power_domain(void) pd->base = of_iomap(np, 0); if (!pd->base) { pr_warn("%s: failed to map memory\n", __func__); - kfree(pd->pd.name); + kfree_const(pd->pd.name); kfree(pd); - of_node_put(np); continue; } diff --git a/arch/arm/mach-omap2/omap-wakeupgen.c b/arch/arm/mach-omap2/omap-wakeupgen.c index 8e52621b5a6b..e1d2e991d17a 100644 --- a/arch/arm/mach-omap2/omap-wakeupgen.c +++ b/arch/arm/mach-omap2/omap-wakeupgen.c @@ -392,6 +392,7 @@ static struct irq_chip wakeupgen_chip = { .irq_mask = wakeupgen_mask, .irq_unmask = wakeupgen_unmask, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_type = irq_chip_set_type_parent, .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND, #ifdef CONFIG_SMP .irq_set_affinity = irq_chip_set_affinity_parent, diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index d78c12e7cb5e..486cc4ded190 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -2373,6 +2373,9 @@ static int of_dev_hwmod_lookup(struct device_node *np, * registers. This address is needed early so the OCP registers that * are part of the device's address space can be ioremapped properly. * + * If SYSC access is not needed, the registers will not be remapped + * and non-availability of MPU access is not treated as an error. + * * Returns 0 on success, -EINVAL if an invalid hwmod is passed, and * -ENXIO on absent or invalid register target address space. */ @@ -2387,6 +2390,11 @@ static int __init _init_mpu_rt_base(struct omap_hwmod *oh, void *data, _save_mpu_port_index(oh); + /* if we don't need sysc access we don't need to ioremap */ + if (!oh->class->sysc) + return 0; + + /* we can't continue without MPU PORT if we need sysc access */ if (oh->_int_flags & _HWMOD_NO_MPU_PORT) return -ENXIO; @@ -2396,8 +2404,10 @@ static int __init _init_mpu_rt_base(struct omap_hwmod *oh, void *data, oh->name); /* Extract the IO space from device tree blob */ - if (!np) + if (!np) { + pr_err("omap_hwmod: %s: no dt node\n", oh->name); return -ENXIO; + } va_start = of_iomap(np, index + oh->mpu_rt_idx); } else { @@ -2456,13 +2466,11 @@ static int __init _init(struct omap_hwmod *oh, void *data) oh->name, np->name); } - if (oh->class->sysc) { - r = _init_mpu_rt_base(oh, NULL, index, np); - if (r < 0) { - WARN(1, "omap_hwmod: %s: doesn't have mpu register target base\n", - oh->name); - return 0; - } + r = _init_mpu_rt_base(oh, NULL, index, np); + if (r < 0) { + WARN(1, "omap_hwmod: %s: doesn't have mpu register target base\n", + oh->name); + return 0; } r = _init_clocks(oh, NULL); diff --git a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c index 2606c6608bd8..562247bced49 100644 --- a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c @@ -827,8 +827,7 @@ static struct omap_hwmod_class_sysconfig dra7xx_gpmc_sysc = { .syss_offs = 0x0014, .sysc_flags = (SYSC_HAS_AUTOIDLE | SYSC_HAS_SIDLEMODE | SYSC_HAS_SOFTRESET | SYSS_HAS_RESET_STATUS), - .idlemodes = (SIDLE_FORCE | SIDLE_NO | SIDLE_SMART | - SIDLE_SMART_WKUP), + .idlemodes = (SIDLE_FORCE | SIDLE_NO | SIDLE_SMART), .sysc_fields = &omap_hwmod_sysc_type1, }; @@ -844,7 +843,7 @@ static struct omap_hwmod dra7xx_gpmc_hwmod = { .class = &dra7xx_gpmc_hwmod_class, .clkdm_name = "l3main1_clkdm", /* Skip reset for CONFIG_OMAP_GPMC_DEBUG for bootloader timings */ - .flags = HWMOD_SWSUP_SIDLE | DEBUG_OMAP_GPMC_HWMOD_FLAGS, + .flags = DEBUG_OMAP_GPMC_HWMOD_FLAGS, .main_clk = "l3_iclk_div", .prcm = { .omap4 = { diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 9d259d94e429..1160434eece0 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -14,7 +14,7 @@ VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 VDSO_LDFLAGS += -nostdlib -shared VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--build-id) -VDSO_LDFLAGS += $(call cc-option, -fuse-ld=bfd) +VDSO_LDFLAGS += $(call cc-ldoption, -fuse-ld=bfd) obj-$(CONFIG_VDSO) += vdso.o extra-$(CONFIG_VDSO) += vdso.lds diff --git a/arch/arm64/boot/dts/apm/apm-storm.dtsi b/arch/arm64/boot/dts/apm/apm-storm.dtsi index 0689c3fb56e3..58093edeea2e 100644 --- a/arch/arm64/boot/dts/apm/apm-storm.dtsi +++ b/arch/arm64/boot/dts/apm/apm-storm.dtsi @@ -823,7 +823,7 @@ device_type = "dma"; reg = <0x0 0x1f270000 0x0 0x10000>, <0x0 0x1f200000 0x0 0x10000>, - <0x0 0x1b008000 0x0 0x2000>, + <0x0 0x1b000000 0x0 0x400000>, <0x0 0x1054a000 0x0 0x100>; interrupts = <0x0 0x82 0x4>, <0x0 0xb8 0x4>, diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 9d4aa18f2a82..e8ca6eaedd02 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -122,12 +122,12 @@ static int __init uefi_init(void) /* Show what we know for posterity */ c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), - sizeof(vendor)); + sizeof(vendor) * sizeof(efi_char16_t)); if (c16) { for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) vendor[i] = c16[i]; vendor[i] = '\0'; - early_memunmap(c16, sizeof(vendor)); + early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); } pr_info("EFI v%u.%.02u by %s\n", diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 1670f15ef69e..948f0ad2de23 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -168,7 +168,8 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) * Other callers might not initialize the si_lsb field, * so check explicitely for the right codes here. */ - if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif break; @@ -201,8 +202,6 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) { - memset(to, 0, sizeof *to); - if (copy_from_user(to, from, __ARCH_SI_PREAMBLE_SIZE) || copy_from_user(to->_sifields._pad, from->_sifields._pad, SI_PAD_SIZE)) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index ec37ab3f524f..97bc68f4c689 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -199,16 +199,15 @@ up_fail: */ void update_vsyscall(struct timekeeper *tk) { - struct timespec xtime_coarse; u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter"); ++vdso_data->tb_seq_count; smp_wmb(); - xtime_coarse = __current_kernel_time(); vdso_data->use_syscall = use_syscall; - vdso_data->xtime_coarse_sec = xtime_coarse.tv_sec; - vdso_data->xtime_coarse_nsec = xtime_coarse.tv_nsec; + vdso_data->xtime_coarse_sec = tk->xtime_sec; + vdso_data->xtime_coarse_nsec = tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift; vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec; vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index f02530e726f6..85c57158dcd9 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -168,8 +168,8 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) { if (!(vcpu->arch.hcr_el2 & HCR_RW)) inject_abt32(vcpu, false, addr); - - inject_abt64(vcpu, false, addr); + else + inject_abt64(vcpu, false, addr); } /** @@ -184,8 +184,8 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) { if (!(vcpu->arch.hcr_el2 & HCR_RW)) inject_abt32(vcpu, true, addr); - - inject_abt64(vcpu, true, addr); + else + inject_abt64(vcpu, true, addr); } /** @@ -198,6 +198,6 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.hcr_el2 & HCR_RW)) inject_undef32(vcpu); - - inject_undef64(vcpu); + else + inject_undef64(vcpu); } diff --git a/arch/avr32/mach-at32ap/clock.c b/arch/avr32/mach-at32ap/clock.c index 23b1a97fae7a..52c179bec0cc 100644 --- a/arch/avr32/mach-at32ap/clock.c +++ b/arch/avr32/mach-at32ap/clock.c @@ -80,6 +80,9 @@ int clk_enable(struct clk *clk) { unsigned long flags; + if (!clk) + return 0; + spin_lock_irqsave(&clk_lock, flags); __clk_enable(clk); spin_unlock_irqrestore(&clk_lock, flags); @@ -106,6 +109,9 @@ void clk_disable(struct clk *clk) { unsigned long flags; + if (IS_ERR_OR_NULL(clk)) + return; + spin_lock_irqsave(&clk_lock, flags); __clk_disable(clk); spin_unlock_irqrestore(&clk_lock, flags); @@ -117,6 +123,9 @@ unsigned long clk_get_rate(struct clk *clk) unsigned long flags; unsigned long rate; + if (!clk) + return 0; + spin_lock_irqsave(&clk_lock, flags); rate = clk->get_rate(clk); spin_unlock_irqrestore(&clk_lock, flags); @@ -129,6 +138,9 @@ long clk_round_rate(struct clk *clk, unsigned long rate) { unsigned long flags, actual_rate; + if (!clk) + return 0; + if (!clk->set_rate) return -ENOSYS; @@ -145,6 +157,9 @@ int clk_set_rate(struct clk *clk, unsigned long rate) unsigned long flags; long ret; + if (!clk) + return 0; + if (!clk->set_rate) return -ENOSYS; @@ -161,6 +176,9 @@ int clk_set_parent(struct clk *clk, struct clk *parent) unsigned long flags; int ret; + if (!clk) + return 0; + if (!clk->set_parent) return -ENOSYS; @@ -174,7 +192,7 @@ EXPORT_SYMBOL(clk_set_parent); struct clk *clk_get_parent(struct clk *clk) { - return clk->parent; + return !clk ? NULL : clk->parent; } EXPORT_SYMBOL(clk_get_parent); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cee5f93e5712..199a8357838c 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -151,7 +151,6 @@ config BMIPS_GENERIC select BCM7120_L2_IRQ select BRCMSTB_L2_IRQ select IRQ_MIPS_CPU - select RAW_IRQ_ACCESSORS select DMA_NONCOHERENT select SYS_SUPPORTS_32BIT_KERNEL select SYS_SUPPORTS_LITTLE_ENDIAN diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c index 01a644f174dd..1ba21204ebe0 100644 --- a/arch/mips/ath79/setup.c +++ b/arch/mips/ath79/setup.c @@ -190,6 +190,7 @@ int get_c0_perfcount_int(void) { return ATH79_MISC_IRQ(5); } +EXPORT_SYMBOL_GPL(get_c0_perfcount_int); unsigned int get_c0_compare_int(void) { diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index 56f5d080ef9d..b7fa9ae28c36 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -42,7 +42,7 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) cvmx_write_csr(CVMX_CIU_MBOX_CLRX(coreid), action); if (action & SMP_CALL_FUNCTION) - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); if (action & SMP_RESCHEDULE_YOURSELF) scheduler_ipi(); diff --git a/arch/mips/include/asm/mach-bcm63xx/dma-coherence.h b/arch/mips/include/asm/mach-bcm63xx/dma-coherence.h deleted file mode 100644 index 11d3b572b1b3..000000000000 --- a/arch/mips/include/asm/mach-bcm63xx/dma-coherence.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __ASM_MACH_BCM63XX_DMA_COHERENCE_H -#define __ASM_MACH_BCM63XX_DMA_COHERENCE_H - -#include <asm/bmips.h> - -#define plat_post_dma_flush bmips_post_dma_flush - -#include <asm/mach-generic/dma-coherence.h> - -#endif /* __ASM_MACH_BCM63XX_DMA_COHERENCE_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9d8106758142..ae8569475264 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -182,8 +182,39 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) * Make sure the buddy is global too (if it's !none, * it better already be global) */ +#ifdef CONFIG_SMP + /* + * For SMP, multiple CPUs can race, so we need to do + * this atomically. + */ +#ifdef CONFIG_64BIT +#define LL_INSN "lld" +#define SC_INSN "scd" +#else /* CONFIG_32BIT */ +#define LL_INSN "ll" +#define SC_INSN "sc" +#endif + unsigned long page_global = _PAGE_GLOBAL; + unsigned long tmp; + + __asm__ __volatile__ ( + " .set push\n" + " .set noreorder\n" + "1: " LL_INSN " %[tmp], %[buddy]\n" + " bnez %[tmp], 2f\n" + " or %[tmp], %[tmp], %[global]\n" + " " SC_INSN " %[tmp], %[buddy]\n" + " beqz %[tmp], 1b\n" + " nop\n" + "2:\n" + " .set pop" + : [buddy] "+m" (buddy->pte), + [tmp] "=&r" (tmp) + : [global] "r" (page_global)); +#else /* !CONFIG_SMP */ if (pte_none(*buddy)) pte_val(*buddy) = pte_val(*buddy) | _PAGE_GLOBAL; +#endif /* CONFIG_SMP */ } #endif } diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index 16f1ea9ab191..03722d4326a1 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -83,8 +83,6 @@ static inline void __cpu_die(unsigned int cpu) extern void play_dead(void); #endif -extern asmlinkage void smp_call_function_interrupt(void); - static inline void arch_send_call_function_single_ipi(int cpu) { extern struct plat_smp_ops *mp_ops; /* private */ diff --git a/arch/mips/include/asm/stackframe.h b/arch/mips/include/asm/stackframe.h index 28d6d9364bd1..a71da576883c 100644 --- a/arch/mips/include/asm/stackframe.h +++ b/arch/mips/include/asm/stackframe.h @@ -152,6 +152,31 @@ .set noreorder bltz k0, 8f move k1, sp +#ifdef CONFIG_EVA + /* + * Flush interAptiv's Return Prediction Stack (RPS) by writing + * EntryHi. Toggling Config7.RPS is slower and less portable. + * + * The RPS isn't automatically flushed when exceptions are + * taken, which can result in kernel mode speculative accesses + * to user addresses if the RPS mispredicts. That's harmless + * when user and kernel share the same address space, but with + * EVA the same user segments may be unmapped to kernel mode, + * even containing sensitive MMIO regions or invalid memory. + * + * This can happen when the kernel sets the return address to + * ret_from_* and jr's to the exception handler, which looks + * more like a tail call than a function call. If nested calls + * don't evict the last user address in the RPS, it will + * mispredict the return and fetch from a user controlled + * address into the icache. + * + * More recent EVA-capable cores with MAAR to restrict + * speculative accesses aren't affected. + */ + MFC0 k0, CP0_ENTRYHI + MTC0 k0, CP0_ENTRYHI +#endif .set reorder /* Called from user mode, new stack. */ get_saved_sp diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S index af42e7003f12..baa7b6fc0a60 100644 --- a/arch/mips/kernel/genex.S +++ b/arch/mips/kernel/genex.S @@ -407,7 +407,7 @@ NESTED(nmi_handler, PT_SIZE, sp) .set noat SAVE_ALL FEXPORT(handle_\exception\ext) - __BUILD_clear_\clear + __build_clear_\clear .set at __BUILD_\verbose \exception move a0, sp diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 3e4491aa6d6b..789d7bf4fef3 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -154,7 +154,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { unsigned int real_len; - cpumask_t mask; + cpumask_t allowed, mask; int retval; struct task_struct *p; @@ -173,7 +173,8 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (retval) goto out_unlock; - cpumask_and(&mask, &p->thread.user_cpus_allowed, cpu_possible_mask); + cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); + cpumask_and(&mask, &allowed, cpu_active_mask); out_unlock: read_unlock(&tasklist_lock); diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index b130033838ba..5fcec3032f38 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -38,7 +38,7 @@ char *mips_get_machine_name(void) return mips_machine_name; } -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF void __init early_init_dt_add_memory_arch(u64 base, u64 size) { return add_memory_region(base, size, BOOT_MEM_RAM); diff --git a/arch/mips/kernel/relocate_kernel.S b/arch/mips/kernel/relocate_kernel.S index 74bab9ddd0e1..c6bbf2165051 100644 --- a/arch/mips/kernel/relocate_kernel.S +++ b/arch/mips/kernel/relocate_kernel.S @@ -24,7 +24,7 @@ LEAF(relocate_new_kernel) process_entry: PTR_L s2, (s0) - PTR_ADD s0, s0, SZREG + PTR_ADDIU s0, s0, SZREG /* * In case of a kdump/crash kernel, the indirection page is not @@ -61,9 +61,9 @@ copy_word: /* copy page word by word */ REG_L s5, (s2) REG_S s5, (s4) - PTR_ADD s4, s4, SZREG - PTR_ADD s2, s2, SZREG - LONG_SUB s6, s6, 1 + PTR_ADDIU s4, s4, SZREG + PTR_ADDIU s2, s2, SZREG + LONG_ADDIU s6, s6, -1 beq s6, zero, process_entry b copy_word b process_entry diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index ad4d44635c76..a6f6b762c47a 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -80,7 +80,7 @@ syscall_trace_entry: SAVE_STATIC move s0, t2 move a0, sp - daddiu a1, v0, __NR_64_Linux + move a1, v0 jal syscall_trace_enter bltz v0, 2f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 446cc654da56..4b2010654c46 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -72,7 +72,7 @@ n32_syscall_trace_entry: SAVE_STATIC move s0, t2 move a0, sp - daddiu a1, v0, __NR_N32_Linux + move a1, v0 jal syscall_trace_enter bltz v0, 2f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 19a7705f2a01..5d7f2634996f 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -409,8 +409,6 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) { - memset(to, 0, sizeof *to); - if (copy_from_user(to, from, 3*sizeof(int)) || copy_from_user(to->_sifields._pad, from->_sifields._pad, SI_PAD_SIZE32)) diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c index 336708ae5c5b..78cf8c2f1de0 100644 --- a/arch/mips/kernel/smp-bmips.c +++ b/arch/mips/kernel/smp-bmips.c @@ -284,7 +284,7 @@ static irqreturn_t bmips5000_ipi_interrupt(int irq, void *dev_id) if (action == 0) scheduler_ipi(); else - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); return IRQ_HANDLED; } @@ -336,7 +336,7 @@ static irqreturn_t bmips43xx_ipi_interrupt(int irq, void *dev_id) if (action & SMP_RESCHEDULE_YOURSELF) scheduler_ipi(); if (action & SMP_CALL_FUNCTION) - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); return IRQ_HANDLED; } diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index d0744cc77ea7..a31896c33716 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -192,16 +192,6 @@ asmlinkage void start_secondary(void) cpu_startup_entry(CPUHP_ONLINE); } -/* - * Call into both interrupt handlers, as we share the IPI for them - */ -void __irq_entry smp_call_function_interrupt(void) -{ - irq_enter(); - generic_smp_call_function_interrupt(); - irq_exit(); -} - static void stop_this_cpu(void *dummy) { /* diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e207a43b5f8f..8ea28e6ab37d 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -192,6 +192,7 @@ static void show_stacktrace(struct task_struct *task, void show_stack(struct task_struct *task, unsigned long *sp) { struct pt_regs regs; + mm_segment_t old_fs = get_fs(); if (sp) { regs.regs[29] = (unsigned long)sp; regs.regs[31] = 0; @@ -210,7 +211,13 @@ void show_stack(struct task_struct *task, unsigned long *sp) prepare_frametrace(®s); } } + /* + * show_stack() deals exclusively with kernel mode, so be sure to access + * the stack in the kernel (not user) address space. + */ + set_fs(KERNEL_DS); show_stacktrace(task, ®s); + set_fs(old_fs); } static void show_code(unsigned int __user *pc) @@ -1519,6 +1526,7 @@ asmlinkage void do_mcheck(struct pt_regs *regs) const int field = 2 * sizeof(unsigned long); int multi_match = regs->cp0_status & ST0_TS; enum ctx_state prev_state; + mm_segment_t old_fs = get_fs(); prev_state = exception_enter(); show_regs(regs); @@ -1540,8 +1548,13 @@ asmlinkage void do_mcheck(struct pt_regs *regs) dump_tlb_all(); } + if (!user_mode(regs)) + set_fs(KERNEL_DS); + show_code((unsigned int __user *) regs->cp0_epc); + set_fs(old_fs); + /* * Some chips may have other causes of machine check (e.g. SB1 * graduation timer) diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index af84bef0c90d..eb3efd137fd1 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -438,7 +438,7 @@ do { \ : "memory"); \ } while(0) -#define StoreDW(addr, value, res) \ +#define _StoreDW(addr, value, res) \ do { \ __asm__ __volatile__ ( \ ".set\tpush\n\t" \ diff --git a/arch/mips/lantiq/irq.c b/arch/mips/lantiq/irq.c index 6ab10573490d..2c218c3bbca5 100644 --- a/arch/mips/lantiq/irq.c +++ b/arch/mips/lantiq/irq.c @@ -293,7 +293,7 @@ static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) { - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); return IRQ_HANDLED; } @@ -466,6 +466,7 @@ int get_c0_perfcount_int(void) { return ltq_perfcount_irq; } +EXPORT_SYMBOL_GPL(get_c0_perfcount_int); unsigned int get_c0_compare_int(void) { diff --git a/arch/mips/loongson64/loongson-3/smp.c b/arch/mips/loongson64/loongson-3/smp.c index 509877c6e9d9..1a4738a8f2d3 100644 --- a/arch/mips/loongson64/loongson-3/smp.c +++ b/arch/mips/loongson64/loongson-3/smp.c @@ -266,8 +266,11 @@ void loongson3_ipi_interrupt(struct pt_regs *regs) if (action & SMP_RESCHEDULE_YOURSELF) scheduler_ipi(); - if (action & SMP_CALL_FUNCTION) - smp_call_function_interrupt(); + if (action & SMP_CALL_FUNCTION) { + irq_enter(); + generic_smp_call_function_interrupt(); + irq_exit(); + } if (action & SMP_ASK_C0COUNT) { BUG_ON(cpu != 0); diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 77d96db8253c..aab218c36e0d 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -160,18 +160,18 @@ static inline void setup_protection_map(void) protection_map[1] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC); protection_map[2] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); protection_map[3] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[4] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_READ); + protection_map[4] = __pgprot(_page_cachable_default | _PAGE_PRESENT); protection_map[5] = __pgprot(_page_cachable_default | _PAGE_PRESENT); - protection_map[6] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_READ); + protection_map[6] = __pgprot(_page_cachable_default | _PAGE_PRESENT); protection_map[7] = __pgprot(_page_cachable_default | _PAGE_PRESENT); protection_map[8] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); protection_map[9] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC); protection_map[10] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | _PAGE_NO_READ); protection_map[11] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); - protection_map[12] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_READ); + protection_map[12] = __pgprot(_page_cachable_default | _PAGE_PRESENT); protection_map[13] = __pgprot(_page_cachable_default | _PAGE_PRESENT); - protection_map[14] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_WRITE | _PAGE_NO_READ); + protection_map[14] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_WRITE); protection_map[15] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_WRITE); } else { diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 36c0f26fac6b..852a41c6da45 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -133,7 +133,8 @@ good_area: #endif goto bad_area; } - if (!(vma->vm_flags & VM_READ)) { + if (!(vma->vm_flags & VM_READ) && + exception_epc(regs) != address) { #if 0 pr_notice("Cpu%d[%s:%d:%0*lx:%ld:%0*lx] RI violation\n", raw_smp_processor_id(), diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index d1392f8f5811..fa8f591f3713 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -222,7 +222,7 @@ static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) { - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); return IRQ_HANDLED; } diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 5625b190edc0..b7bf721eabf5 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -154,6 +154,7 @@ int get_c0_perfcount_int(void) return mips_cpu_perf_irq; } +EXPORT_SYMBOL_GPL(get_c0_perfcount_int); unsigned int get_c0_compare_int(void) { @@ -171,14 +172,17 @@ unsigned int get_c0_compare_int(void) static void __init init_rtc(void) { - /* stop the clock whilst setting it up */ - CMOS_WRITE(RTC_SET | RTC_24H, RTC_CONTROL); + unsigned char freq, ctrl; - /* 32KHz time base */ - CMOS_WRITE(RTC_REF_CLCK_32KHZ, RTC_FREQ_SELECT); + /* Set 32KHz time base if not already set */ + freq = CMOS_READ(RTC_FREQ_SELECT); + if ((freq & RTC_DIV_CTL) != RTC_REF_CLCK_32KHZ) + CMOS_WRITE(RTC_REF_CLCK_32KHZ, RTC_FREQ_SELECT); - /* start the clock */ - CMOS_WRITE(RTC_24H, RTC_CONTROL); + /* Ensure SET bit is clear so RTC can run */ + ctrl = CMOS_READ(RTC_CONTROL); + if (ctrl & RTC_SET) + CMOS_WRITE(ctrl & ~RTC_SET, RTC_CONTROL); } void __init plat_time_init(void) diff --git a/arch/mips/mti-sead3/sead3-time.c b/arch/mips/mti-sead3/sead3-time.c index e1d69895fb1d..a120b7a5a8fe 100644 --- a/arch/mips/mti-sead3/sead3-time.c +++ b/arch/mips/mti-sead3/sead3-time.c @@ -77,6 +77,7 @@ int get_c0_perfcount_int(void) return MIPS_CPU_IRQ_BASE + cp0_perfcount_irq; return -1; } +EXPORT_SYMBOL_GPL(get_c0_perfcount_int); unsigned int get_c0_compare_int(void) { diff --git a/arch/mips/netlogic/common/smp.c b/arch/mips/netlogic/common/smp.c index dc3e327fbbac..f5fff228b347 100644 --- a/arch/mips/netlogic/common/smp.c +++ b/arch/mips/netlogic/common/smp.c @@ -86,7 +86,7 @@ void nlm_smp_function_ipi_handler(unsigned int irq, struct irq_desc *desc) { clear_c0_eimr(irq); ack_c0_eirr(irq); - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); set_c0_eimr(irq); } diff --git a/arch/mips/paravirt/paravirt-smp.c b/arch/mips/paravirt/paravirt-smp.c index 42181c7105df..f8d3e081b2eb 100644 --- a/arch/mips/paravirt/paravirt-smp.c +++ b/arch/mips/paravirt/paravirt-smp.c @@ -114,7 +114,7 @@ static irqreturn_t paravirt_reched_interrupt(int irq, void *dev_id) static irqreturn_t paravirt_function_interrupt(int irq, void *dev_id) { - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); return IRQ_HANDLED; } diff --git a/arch/mips/pistachio/time.c b/arch/mips/pistachio/time.c index 7c73fcb92a10..8a377346f0ca 100644 --- a/arch/mips/pistachio/time.c +++ b/arch/mips/pistachio/time.c @@ -26,6 +26,7 @@ int get_c0_perfcount_int(void) { return gic_get_c0_perfcount_int(); } +EXPORT_SYMBOL_GPL(get_c0_perfcount_int); int get_c0_fdc_int(void) { diff --git a/arch/mips/pmcs-msp71xx/msp_smp.c b/arch/mips/pmcs-msp71xx/msp_smp.c index 10170580a2de..ffa0f7101a97 100644 --- a/arch/mips/pmcs-msp71xx/msp_smp.c +++ b/arch/mips/pmcs-msp71xx/msp_smp.c @@ -44,7 +44,7 @@ static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) { - smp_call_function_interrupt(); + generic_smp_call_function_interrupt(); return IRQ_HANDLED; } diff --git a/arch/mips/ralink/irq.c b/arch/mips/ralink/irq.c index 53707aacc0f8..8c624a8b9ea2 100644 --- a/arch/mips/ralink/irq.c +++ b/arch/mips/ralink/irq.c @@ -89,6 +89,7 @@ int get_c0_perfcount_int(void) { return rt_perfcount_irq; } +EXPORT_SYMBOL_GPL(get_c0_perfcount_int); unsigned int get_c0_compare_int(void) { diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 3fbaef97a1b8..16ec4e12daa3 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -107,10 +107,14 @@ static void ip27_do_irq_mask0(void) scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); - smp_call_function_interrupt(); + irq_enter(); + generic_smp_call_function_interrupt(); + irq_exit(); } else if (pend0 & (1UL << CPU_CALL_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_B_IRQ); - smp_call_function_interrupt(); + irq_enter(); + generic_smp_call_function_interrupt(); + irq_exit(); } else #endif { diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index af7d44edd9a8..4c71aea25663 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -29,8 +29,6 @@ #include <asm/sibyte/bcm1480_regs.h> #include <asm/sibyte/bcm1480_int.h> -extern void smp_call_function_interrupt(void); - /* * These are routines for dealing with the bcm1480 smp capabilities * independent of board/firmware @@ -184,6 +182,9 @@ void bcm1480_mailbox_interrupt(void) if (action & SMP_RESCHEDULE_YOURSELF) scheduler_ipi(); - if (action & SMP_CALL_FUNCTION) - smp_call_function_interrupt(); + if (action & SMP_CALL_FUNCTION) { + irq_enter(); + generic_smp_call_function_interrupt(); + irq_exit(); + } } diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c0c4b3f88a08..1cf66f5ff23d 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -172,6 +172,9 @@ void sb1250_mailbox_interrupt(void) if (action & SMP_RESCHEDULE_YOURSELF) scheduler_ipi(); - if (action & SMP_CALL_FUNCTION) - smp_call_function_interrupt(); + if (action & SMP_CALL_FUNCTION) { + irq_enter(); + generic_smp_call_function_interrupt(); + irq_exit(); + } } diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 42e02a2d570b..efc3fa54c90b 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -191,6 +191,9 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, pci_device_add(dev, bus); + /* Setup MSI caps & disable MSI/MSI-X interrupts */ + pci_msi_setup_pci_dev(dev); + return dev; } EXPORT_SYMBOL(of_create_pci_dev); diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index d3a831ac0f92..da50e0c9c57e 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -966,8 +966,6 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from) { - memset(to, 0, sizeof *to); - if (copy_from_user(to, from, 3*sizeof(int)) || copy_from_user(to->_sifields._pad, from->_sifields._pad, SI_PAD_SIZE32)) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 5cf5e6ea213b..7cf0df859d05 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1478,7 +1478,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) } /* Unmask the event */ - if (eeh_enabled()) + if (ret == EEH_NEXT_ERR_NONE && eeh_enabled()) enable_irq(eeh_event_irq); return ret; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5738d315248b..85cbc96eff6c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2220,7 +2220,7 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, unsigned levels, unsigned long limit, - unsigned long *current_offset) + unsigned long *current_offset, unsigned long *total_allocated) { struct page *tce_mem = NULL; __be64 *addr, *tmp; @@ -2236,6 +2236,7 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, } addr = page_address(tce_mem); memset(addr, 0, allocated); + *total_allocated += allocated; --levels; if (!levels) { @@ -2245,7 +2246,7 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, for (i = 0; i < entries; ++i) { tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, - levels, limit, current_offset); + levels, limit, current_offset, total_allocated); if (!tmp) break; @@ -2267,7 +2268,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, struct iommu_table *tbl) { void *addr; - unsigned long offset = 0, level_shift; + unsigned long offset = 0, level_shift, total_allocated = 0; const unsigned window_shift = ilog2(window_size); unsigned entries_shift = window_shift - page_shift; unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT); @@ -2286,7 +2287,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, /* Allocate TCE table */ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset); + levels, tce_table_size, &offset, &total_allocated); /* addr==NULL means that the first level allocation failed */ if (!addr) @@ -2308,7 +2309,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, page_shift); tbl->it_level_size = 1ULL << (level_shift - 3); tbl->it_indirect_levels = levels - 1; - tbl->it_allocated_size = offset; + tbl->it_allocated_size = total_allocated; pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", window_size, tce_table_size, bus_offset); diff --git a/arch/s390/include/asm/etr.h b/arch/s390/include/asm/etr.h index 629b79a93165..f7e5c36688c3 100644 --- a/arch/s390/include/asm/etr.h +++ b/arch/s390/include/asm/etr.h @@ -214,6 +214,9 @@ static inline int etr_ptff(void *ptff_block, unsigned int func) void etr_switch_to_local(void); void etr_sync_check(void); +/* notifier for syncs */ +extern struct atomic_notifier_head s390_epoch_delta_notifier; + /* STP interruption parameter */ struct stp_irq_parm { unsigned int _pad0 : 14; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3024acbe1f9d..df4db81254d3 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -258,6 +258,9 @@ struct kvm_vcpu_stat { u32 diagnose_10; u32 diagnose_44; u32 diagnose_9c; + u32 diagnose_258; + u32 diagnose_308; + u32 diagnose_500; }; #define PGM_OPERATION 0x01 @@ -630,7 +633,6 @@ extern char sie_exit; static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_check_processor_compat(void *rtn) {} -static inline void kvm_arch_exit(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index bff5e3b6d822..8ba32436effe 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -138,6 +138,8 @@ int init_cache_level(unsigned int cpu) union cache_topology ct; enum cache_type ctype; + if (!test_facility(34)) + return -EOPNOTSUPP; if (!this_cpu_ci) return -EINVAL; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 9e733d965e08..627887b075a7 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -58,6 +58,9 @@ EXPORT_SYMBOL_GPL(sched_clock_base_cc); static DEFINE_PER_CPU(struct clock_event_device, comparators); +ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier); +EXPORT_SYMBOL(s390_epoch_delta_notifier); + /* * Scheduler clock - returns current time in nanosec units. */ @@ -752,7 +755,7 @@ static void clock_sync_cpu(struct clock_sync_data *sync) static int etr_sync_clock(void *data) { static int first; - unsigned long long clock, old_clock, delay, delta; + unsigned long long clock, old_clock, clock_delta, delay, delta; struct clock_sync_data *etr_sync; struct etr_aib *sync_port, *aib; int port; @@ -789,6 +792,9 @@ static int etr_sync_clock(void *data) delay = (unsigned long long) (aib->edf2.etv - sync_port->edf2.etv) << 32; delta = adjust_time(old_clock, clock, delay); + clock_delta = clock - old_clock; + atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, + &clock_delta); etr_sync->fixup_cc = delta; fixup_clock_comparator(delta); /* Verify that the clock is properly set. */ @@ -1526,7 +1532,7 @@ void stp_island_check(void) static int stp_sync_clock(void *data) { static int first; - unsigned long long old_clock, delta; + unsigned long long old_clock, delta, new_clock, clock_delta; struct clock_sync_data *stp_sync; int rc; @@ -1551,7 +1557,11 @@ static int stp_sync_clock(void *data) old_clock = get_tod_clock(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0); if (rc == 0) { - delta = adjust_time(old_clock, get_tod_clock(), 0); + new_clock = get_tod_clock(); + delta = adjust_time(old_clock, new_clock, 0); + clock_delta = new_clock - old_clock; + atomic_notifier_call_chain(&s390_epoch_delta_notifier, + 0, &clock_delta); fixup_clock_comparator(delta); rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index fc7ec95848c3..5fbfb88f8477 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -27,13 +27,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; + vcpu->stat.diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end || start < 2 * PAGE_SIZE) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); - vcpu->stat.diagnose_10++; /* * We checked for start >= end above, so lets check for the @@ -75,6 +75,9 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4; u16 ry = (vcpu->arch.sie_block->ipa & 0x0f); + VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx", + vcpu->run->s.regs.gprs[rx]); + vcpu->stat.diagnose_258++; if (vcpu->run->s.regs.gprs[rx] & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); @@ -85,6 +88,9 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) switch (parm.subcode) { case 0: /* TOKEN */ + VCPU_EVENT(vcpu, 3, "pageref token addr 0x%llx " + "select mask 0x%llx compare mask 0x%llx", + parm.token_addr, parm.select_mask, parm.compare_mask); if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) { /* * If the pagefault handshake is already activated, @@ -114,6 +120,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) * the cancel, therefore to reduce code complexity, we assume * all outstanding tokens are already pending. */ + VCPU_EVENT(vcpu, 3, "pageref cancel addr 0x%llx", parm.token_addr); if (parm.token_addr || parm.select_mask || parm.compare_mask || parm.zarch) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -174,7 +181,8 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) unsigned int reg = vcpu->arch.sie_block->ipa & 0xf; unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff; - VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode); + VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode); + vcpu->stat.diagnose_308++; switch (subcode) { case 3: vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; @@ -202,6 +210,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) { int ret; + vcpu->stat.diagnose_500++; /* No virtio-ccw notification? Get out quickly. */ if (!vcpu->kvm->arch.css_support || (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index e97b3455d7e6..47518a324d75 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -473,10 +473,45 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->iprcc &= ~PGM_PER; } +#define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH) +#define hssec(vcpu) (vcpu->arch.sie_block->gcr[13] & _ASCE_SPACE_SWITCH) +#define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1) +#define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff) + void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) { + int new_as; + if (debug_exit_required(vcpu)) vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; filter_guest_per_event(vcpu); + + /* + * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger + * a space-switch event. PER events enforce space-switch events + * for these instructions. So if no PER event for the guest is left, + * we might have to filter the space-switch element out, too. + */ + if (vcpu->arch.sie_block->iprcc == PGM_SPACE_SWITCH) { + vcpu->arch.sie_block->iprcc = 0; + new_as = psw_bits(vcpu->arch.sie_block->gpsw).as; + + /* + * If the AS changed from / to home, we had RP, SAC or SACF + * instruction. Check primary and home space-switch-event + * controls. (theoretically home -> home produced no event) + */ + if (((new_as == PSW_AS_HOME) ^ old_as_is_home(vcpu)) && + (pssec(vcpu) || hssec(vcpu))) + vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; + + /* + * PT, PTI, PR, PC instruction operate on primary AS only. Check + * if the primary-space-switch-event control was or got set. + */ + if (new_as == PSW_AS_PRIMARY && !old_as_is_home(vcpu) && + (pssec(vcpu) || old_ssec(vcpu))) + vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; + } } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c98d89708e99..b277d50dcf76 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -30,7 +30,6 @@ #define IOINT_SCHID_MASK 0x0000ffff #define IOINT_SSID_MASK 0x00030000 #define IOINT_CSSID_MASK 0x03fc0000 -#define IOINT_AI_MASK 0x04000000 #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 @@ -72,9 +71,13 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) static int ckc_irq_pending(struct kvm_vcpu *vcpu) { + preempt_disable(); if (!(vcpu->arch.sie_block->ckc < - get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) + get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) { + preempt_enable(); return 0; + } + preempt_enable(); return ckc_interrupts_enabled(vcpu); } @@ -311,8 +314,8 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu) li->irq.ext.ext_params2 = 0; spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "interrupt: pfault init parm:%x,parm64:%llx", - 0, ext.ext_params2); + VCPU_EVENT(vcpu, 4, "deliver: pfault init token 0x%llx", + ext.ext_params2); trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT, 0, ext.ext_params2); @@ -368,7 +371,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) spin_unlock(&fi->lock); if (deliver) { - VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx", + VCPU_EVENT(vcpu, 3, "deliver: machine check mcic 0x%llx", mchk.mcic); trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK, @@ -403,7 +406,7 @@ static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc; - VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart"); + VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); @@ -427,7 +430,6 @@ static int __must_check __deliver_set_prefix(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", prefix.address); vcpu->stat.deliver_prefix_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, @@ -450,7 +452,7 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg"); + VCPU_EVENT(vcpu, 4, "%s", "deliver: sigp emerg"); vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, cpu_addr, 0); @@ -477,7 +479,7 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); + VCPU_EVENT(vcpu, 4, "%s", "deliver: sigp ext call"); vcpu->stat.deliver_external_call++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, @@ -506,7 +508,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) memset(&li->irq.pgm, 0, sizeof(pgm_info)); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x", + VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d", pgm_info.code, ilc); vcpu->stat.deliver_program_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, @@ -622,7 +624,7 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); spin_unlock(&fi->lock); - VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", + VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", ext.ext_params); vcpu->stat.deliver_service_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, @@ -651,9 +653,6 @@ static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) struct kvm_s390_interrupt_info, list); if (inti) { - trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, - KVM_S390_INT_PFAULT_DONE, 0, - inti->ext.ext_params2); list_del(&inti->list); fi->counters[FIRQ_CNTR_PFAULT] -= 1; } @@ -662,6 +661,12 @@ static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) spin_unlock(&fi->lock); if (inti) { + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_INT_PFAULT_DONE, 0, + inti->ext.ext_params2); + VCPU_EVENT(vcpu, 4, "deliver: pfault done token 0x%llx", + inti->ext.ext_params2); + rc = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, (u16 *)__LC_EXT_INT_CODE); rc |= put_guest_lc(vcpu, PFAULT_DONE, @@ -691,7 +696,7 @@ static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu) list); if (inti) { VCPU_EVENT(vcpu, 4, - "interrupt: virtio parm:%x,parm64:%llx", + "deliver: virtio parm: 0x%x,parm64: 0x%llx", inti->ext.ext_params, inti->ext.ext_params2); vcpu->stat.deliver_virtio_interrupt++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, @@ -741,7 +746,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info, list); if (inti) { - VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type); + VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type); vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, @@ -855,7 +860,9 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) goto no_timer; } + preempt_disable(); now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; + preempt_enable(); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* underflow */ @@ -864,7 +871,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) __set_cpu_idle(vcpu); hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); - VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); + VCPU_EVENT(vcpu, 4, "enabled wait via clock comparator: %llu ns", sltime); no_timer: srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); @@ -894,7 +901,9 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) u64 now, sltime; vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); + preempt_disable(); now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; + preempt_enable(); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* @@ -968,6 +977,10 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + VCPU_EVENT(vcpu, 3, "inject: program irq code 0x%x", irq->u.pgm.code); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, + irq->u.pgm.code, 0); + li->irq.pgm = irq->u.pgm; set_bit(IRQ_PEND_PROG, &li->pending_irqs); return 0; @@ -978,9 +991,6 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_irq irq; - VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, code, - 0, 1); spin_lock(&li->lock); irq.u.pgm.code = code; __inject_prog(vcpu, &irq); @@ -996,10 +1006,6 @@ int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, struct kvm_s390_irq irq; int rc; - VCPU_EVENT(vcpu, 3, "inject: prog irq %d (from kernel)", - pgm_info->code); - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, - pgm_info->code, 0, 1); spin_lock(&li->lock); irq.u.pgm = *pgm_info; rc = __inject_prog(vcpu, &irq); @@ -1012,11 +1018,11 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: external irq params:%x, params2:%llx", - irq->u.ext.ext_params, irq->u.ext.ext_params2); + VCPU_EVENT(vcpu, 4, "inject: pfault init parameter block at 0x%llx", + irq->u.ext.ext_params2); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT, irq->u.ext.ext_params, - irq->u.ext.ext_params2, 2); + irq->u.ext.ext_params2); li->irq.ext = irq->u.ext; set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs); @@ -1045,10 +1051,10 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_extcall_info *extcall = &li->irq.extcall; uint16_t src_id = irq->u.extcall.code; - VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u", + VCPU_EVENT(vcpu, 4, "inject: external call source-cpu:%u", src_id); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, - src_id, 0, 2); + src_id, 0); /* sending vcpu invalid */ if (src_id >= KVM_MAX_VCPUS || @@ -1070,10 +1076,10 @@ static int __inject_set_prefix(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_prefix_info *prefix = &li->irq.prefix; - VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)", + VCPU_EVENT(vcpu, 3, "inject: set prefix to %x", irq->u.prefix.address); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, - irq->u.prefix.address, 0, 2); + irq->u.prefix.address, 0); if (!is_vcpu_stopped(vcpu)) return -EBUSY; @@ -1090,7 +1096,7 @@ static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_stop_info *stop = &li->irq.stop; int rc = 0; - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0, 2); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0); if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS) return -EINVAL; @@ -1114,8 +1120,8 @@ static int __inject_sigp_restart(struct kvm_vcpu *vcpu, { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: restart type %llx", irq->type); - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0, 2); + VCPU_EVENT(vcpu, 3, "%s", "inject: restart int"); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); set_bit(IRQ_PEND_RESTART, &li->pending_irqs); return 0; @@ -1126,10 +1132,10 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu, { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", + VCPU_EVENT(vcpu, 4, "inject: emergency from cpu %u", irq->u.emerg.code); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, - irq->u.emerg.code, 0, 2); + irq->u.emerg.code, 0); set_bit(irq->u.emerg.code, li->sigp_emerg_pending); set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); @@ -1142,10 +1148,10 @@ static int __inject_mchk(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_mchk_info *mchk = &li->irq.mchk; - VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx", + VCPU_EVENT(vcpu, 3, "inject: machine check mcic 0x%llx", irq->u.mchk.mcic); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0, - irq->u.mchk.mcic, 2); + irq->u.mchk.mcic); /* * Because repressible machine checks can be indicated along with @@ -1172,9 +1178,9 @@ static int __inject_ckc(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: type %x", KVM_S390_INT_CLOCK_COMP); + VCPU_EVENT(vcpu, 3, "%s", "inject: clock comparator external"); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, - 0, 0, 2); + 0, 0); set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); @@ -1185,9 +1191,9 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: type %x", KVM_S390_INT_CPU_TIMER); + VCPU_EVENT(vcpu, 3, "%s", "inject: cpu timer external"); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, - 0, 0, 2); + 0, 0); set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); @@ -1435,20 +1441,20 @@ int kvm_s390_inject_vm(struct kvm *kvm, inti->ext.ext_params2 = s390int->parm64; break; case KVM_S390_INT_SERVICE: - VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm); + VM_EVENT(kvm, 4, "inject: sclp parm:%x", s390int->parm); inti->ext.ext_params = s390int->parm; break; case KVM_S390_INT_PFAULT_DONE: inti->ext.ext_params2 = s390int->parm64; break; case KVM_S390_MCHK: - VM_EVENT(kvm, 5, "inject: machine check parm64:%llx", + VM_EVENT(kvm, 3, "inject: machine check mcic 0x%llx", s390int->parm64); inti->mchk.cr14 = s390int->parm; /* upper bits are not used */ inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (inti->type & IOINT_AI_MASK) + if (inti->type & KVM_S390_INT_IO_AI_MASK) VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); else VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x", @@ -1535,8 +1541,6 @@ static int do_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) switch (irq->type) { case KVM_S390_PROGRAM_INT: - VCPU_EVENT(vcpu, 3, "inject: program check %d (from user)", - irq->u.pgm.code); rc = __inject_prog(vcpu, irq); break; case KVM_S390_SIGP_SET_PREFIX: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2078f92d15ac..6861b74649ae 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -28,6 +28,7 @@ #include <linux/vmalloc.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> +#include <asm/etr.h> #include <asm/pgtable.h> #include <asm/nmi.h> #include <asm/switch_to.h> @@ -108,6 +109,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "diagnose_10", VCPU_STAT(diagnose_10) }, { "diagnose_44", VCPU_STAT(diagnose_44) }, { "diagnose_9c", VCPU_STAT(diagnose_9c) }, + { "diagnose_258", VCPU_STAT(diagnose_258) }, + { "diagnose_308", VCPU_STAT(diagnose_308) }, + { "diagnose_500", VCPU_STAT(diagnose_500) }, { NULL } }; @@ -124,6 +128,7 @@ unsigned long kvm_s390_fac_list_mask_size(void) } static struct gmap_notifier gmap_notifier; +debug_info_t *kvm_s390_dbf; /* Section: not file related */ int kvm_arch_hardware_enable(void) @@ -134,24 +139,69 @@ int kvm_arch_hardware_enable(void) static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); +/* + * This callback is executed during stop_machine(). All CPUs are therefore + * temporarily stopped. In order not to change guest behavior, we have to + * disable preemption whenever we touch the epoch of kvm and the VCPUs, + * so a CPU won't be stopped while calculating with the epoch. + */ +static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, + void *v) +{ + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + unsigned long long *delta = v; + + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm->arch.epoch -= *delta; + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.sie_block->epoch -= *delta; + } + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_clock_notifier = { + .notifier_call = kvm_clock_sync, +}; + int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; gmap_register_ipte_notifier(&gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; } void kvm_arch_hardware_unsetup(void) { gmap_unregister_ipte_notifier(&gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); } int kvm_arch_init(void *opaque) { + kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf) + return -ENOMEM; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) { + debug_unregister(kvm_s390_dbf); + return -ENOMEM; + } + /* Register floating interrupt controller interface. */ return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); } +void kvm_arch_exit(void) +{ + debug_unregister(kvm_s390_dbf); +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -281,10 +331,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) switch (cap->cap) { case KVM_CAP_S390_IRQCHIP: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_IRQCHIP"); kvm->arch.use_irqchip = 1; r = 0; break; case KVM_CAP_S390_USER_SIGP: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_SIGP"); kvm->arch.user_sigp = 1; r = 0; break; @@ -295,8 +347,11 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = 0; } else r = -EINVAL; + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_VECTOR_REGISTERS %s", + r ? "(not available)" : "(success)"); break; case KVM_CAP_S390_USER_STSI: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI"); kvm->arch.user_stsi = 1; r = 0; break; @@ -314,6 +369,8 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att switch (attr->attr) { case KVM_S390_VM_MEM_LIMIT_SIZE: ret = 0; + VM_EVENT(kvm, 3, "QUERY: max guest memory: %lu bytes", + kvm->arch.gmap->asce_end); if (put_user(kvm->arch.gmap->asce_end, (u64 __user *)attr->addr)) ret = -EFAULT; break; @@ -330,7 +387,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att unsigned int idx; switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: + /* enable CMMA only for z10 and later (EDAT_1) */ + ret = -EINVAL; + if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1) + break; + ret = -EBUSY; + VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support"); mutex_lock(&kvm->lock); if (atomic_read(&kvm->online_vcpus) == 0) { kvm->arch.use_cmma = 1; @@ -339,6 +402,11 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att mutex_unlock(&kvm->lock); break; case KVM_S390_VM_MEM_CLR_CMMA: + ret = -EINVAL; + if (!kvm->arch.use_cmma) + break; + + VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); mutex_lock(&kvm->lock); idx = srcu_read_lock(&kvm->srcu); s390_reset_cmma(kvm->arch.gmap->mm); @@ -374,6 +442,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } } mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: max guest memory: %lu bytes", new_limit); break; } default: @@ -400,22 +469,26 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) kvm->arch.crypto.crycb->aes_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); kvm->arch.crypto.aes_kw = 1; + VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: get_random_bytes( kvm->arch.crypto.crycb->dea_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); kvm->arch.crypto.dea_kw = 1; + VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: kvm->arch.crypto.aes_kw = 0; memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); + VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: kvm->arch.crypto.dea_kw = 0; memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); + VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support"); break; default: mutex_unlock(&kvm->lock); @@ -440,6 +513,7 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (gtod_high != 0) return -EINVAL; + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high); return 0; } @@ -459,12 +533,15 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) return r; mutex_lock(&kvm->lock); + preempt_disable(); kvm->arch.epoch = gtod - host_tod; kvm_s390_vcpu_block_all(kvm); kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; kvm_s390_vcpu_unblock_all(kvm); + preempt_enable(); mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod); return 0; } @@ -496,6 +573,7 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (copy_to_user((void __user *)attr->addr, >od_high, sizeof(gtod_high))) return -EFAULT; + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high); return 0; } @@ -509,9 +587,12 @@ static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) if (r) return r; + preempt_disable(); gtod = host_tod + kvm->arch.epoch; + preempt_enable(); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; + VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); return 0; } @@ -821,7 +902,9 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) } /* Enable storage key handling for the guest */ - s390_enable_skey(); + r = s390_enable_skey(); + if (r) + goto out; for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -879,8 +962,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.use_irqchip) { /* Set up dummy routing. */ memset(&routing, 0, sizeof(routing)); - kvm_set_irq_routing(kvm, &routing, 0, 0); - r = 0; + r = kvm_set_irq_routing(kvm, &routing, 0, 0); } break; } @@ -1043,7 +1125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) sprintf(debug_name, "kvm-%u", current->pid); - kvm->arch.dbf = debug_register(debug_name, 8, 2, 8 * sizeof(long)); + kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long)); if (!kvm->arch.dbf) goto out_err; @@ -1086,7 +1168,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.ipte_mutex); debug_register_view(kvm->arch.dbf, &debug_sprintf_view); - VM_EVENT(kvm, 3, "%s", "vm created"); + VM_EVENT(kvm, 3, "vm created with type %lu", type); if (type & KVM_VM_S390_UCONTROL) { kvm->arch.gmap = NULL; @@ -1103,6 +1185,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); + KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid); return 0; out_err: @@ -1110,6 +1193,7 @@ out_err: free_page((unsigned long)kvm->arch.model.fac); debug_unregister(kvm->arch.dbf); free_page((unsigned long)(kvm->arch.sca)); + KVM_EVENT(3, "creation of vm failed: %d", rc); return rc; } @@ -1131,7 +1215,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (kvm_is_ucontrol(vcpu->kvm)) gmap_free(vcpu->arch.gmap); - if (kvm_s390_cmma_enabled(vcpu->kvm)) + if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); free_page((unsigned long)(vcpu->arch.sie_block)); @@ -1166,6 +1250,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) gmap_free(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); + KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ @@ -1264,7 +1349,9 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { mutex_lock(&vcpu->kvm->lock); + preempt_disable(); vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch; + preempt_enable(); mutex_unlock(&vcpu->kvm->lock); if (!kvm_is_ucontrol(vcpu->kvm)) vcpu->arch.gmap = vcpu->kvm->arch.gmap; @@ -1342,7 +1429,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; - if (kvm_s390_cmma_enabled(vcpu->kvm)) { + if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); if (rc) return rc; @@ -1723,18 +1810,6 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return rc; } -bool kvm_s390_cmma_enabled(struct kvm *kvm) -{ - if (!MACHINE_IS_LPAR) - return false; - /* only enable for z10 and later */ - if (!MACHINE_HAS_EDAT1) - return false; - if (!kvm->arch.use_cmma) - return false; - return true; -} - static bool ibs_enabled(struct kvm_vcpu *vcpu) { return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS; @@ -1742,10 +1817,10 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { - if (!vcpu->requests) - return 0; retry: kvm_s390_vcpu_request_handled(vcpu); + if (!vcpu->requests) + return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the * guest prefix page. gmap_ipte_notify will wait on the ptl lock. @@ -2340,6 +2415,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_S390_CSS_SUPPORT: if (!vcpu->kvm->arch.css_support) { vcpu->kvm->arch.css_support = 1; + VM_EVENT(vcpu->kvm, 3, "%s", "ENABLE: CSS support"); trace_kvm_s390_enable_css(vcpu->kvm); } r = 0; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c5704786e473..c446aabf60d3 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -27,6 +27,13 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); #define TDB_FORMAT1 1 #define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +extern debug_info_t *kvm_s390_dbf; +#define KVM_EVENT(d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ + d_args); \ +} while (0) + #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ do { \ debug_sprintf_event(d_kvm->arch.dbf, d_loglevel, d_string "\n", \ @@ -65,6 +72,8 @@ static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) { + VCPU_EVENT(vcpu, 3, "set prefix of cpu %03u to 0x%x", vcpu->vcpu_id, + prefix); vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); @@ -217,8 +226,6 @@ void exit_sie(struct kvm_vcpu *vcpu); void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu); int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); -/* is cmma enabled */ -bool kvm_s390_cmma_enabled(struct kvm *kvm); unsigned long kvm_s390_fac_list_mask_size(void); extern unsigned long kvm_s390_fac_list_mask[]; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ad4242245771..4d21dc4d1a84 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -53,11 +53,14 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) kvm_s390_set_psw_cc(vcpu, 3); return 0; } + VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); val = (val - hostclk) & ~0x3fUL; mutex_lock(&vcpu->kvm->lock); + preempt_disable(); kvm_for_each_vcpu(i, cpup, vcpu->kvm) cpup->arch.sie_block->epoch = val; + preempt_enable(); mutex_unlock(&vcpu->kvm->lock); kvm_s390_set_psw_cc(vcpu, 0); @@ -98,8 +101,6 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); kvm_s390_set_prefix(vcpu, address); - - VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); trace_kvm_s390_handle_prefix(vcpu, 1, address); return 0; } @@ -129,7 +130,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); + VCPU_EVENT(vcpu, 3, "STPX: storing prefix 0x%x into 0x%llx", address, operand2); trace_kvm_s390_handle_prefix(vcpu, 0, address); return 0; } @@ -155,7 +156,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", ga); + VCPU_EVENT(vcpu, 3, "STAP: storing cpu address (%u) to 0x%llx", vcpu_id, ga); trace_kvm_s390_handle_stap(vcpu, ga); return 0; } @@ -167,6 +168,7 @@ static int __skey_check_enable(struct kvm_vcpu *vcpu) return rc; rc = s390_enable_skey(); + VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest"); trace_kvm_s390_skey_related_inst(vcpu); vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); return rc; @@ -370,7 +372,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu) &fac, sizeof(fac)); if (rc) return rc; - VCPU_EVENT(vcpu, 5, "store facility list value %x", fac); + VCPU_EVENT(vcpu, 3, "STFL: store facility list 0x%x", fac); trace_kvm_s390_handle_stfl(vcpu, fac); return 0; } @@ -468,7 +470,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - VCPU_EVENT(vcpu, 5, "%s", "store cpu id"); + VCPU_EVENT(vcpu, 3, "STIDP: store cpu id 0x%llx", stidp_data); return 0; } @@ -521,7 +523,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) ar_t ar; vcpu->stat.instruction_stsi++; - VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); + VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2); if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -758,10 +760,10 @@ static int handle_essa(struct kvm_vcpu *vcpu) struct gmap *gmap; int i; - VCPU_EVENT(vcpu, 5, "cmma release %d pages", entries); + VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); gmap = vcpu->arch.gmap; vcpu->stat.instruction_essa++; - if (!kvm_s390_cmma_enabled(vcpu->kvm)) + if (!vcpu->kvm->arch.use_cmma) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) @@ -829,7 +831,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) if (ga & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "LCTL: r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, ga); nr_regs = ((reg3 - reg1) & 0xf) + 1; @@ -868,7 +870,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu) if (ga & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "stctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "STCTL r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_stctl(vcpu, 0, reg1, reg3, ga); reg = reg1; @@ -902,7 +904,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) if (ga & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "LCTLG: r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, ga); nr_regs = ((reg3 - reg1) & 0xf) + 1; @@ -940,7 +942,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu) if (ga & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "stctg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "STCTG r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_stctl(vcpu, 1, reg1, reg3, ga); reg = reg1; diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 72e58bd2bee7..da690b69f9fe 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -205,9 +205,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INCORRECT_STATE; return SIGP_CC_STATUS_STORED; - } else if (rc == 0) { - VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", - dst_vcpu->vcpu_id, irq.u.prefix.address); } return rc; @@ -371,7 +368,8 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, return rc; } -static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code) +static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code, + u16 cpu_addr) { if (!vcpu->kvm->arch.user_sigp) return 0; @@ -414,9 +412,8 @@ static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code) default: vcpu->stat.instruction_sigp_unknown++; } - - VCPU_EVENT(vcpu, 4, "sigp order %u: completely handled in user space", - order_code); + VCPU_EVENT(vcpu, 3, "SIGP: order %u for CPU %d handled in userspace", + order_code, cpu_addr); return 1; } @@ -435,7 +432,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - if (handle_sigp_order_in_user_space(vcpu, order_code)) + if (handle_sigp_order_in_user_space(vcpu, order_code, cpu_addr)) return -EOPNOTSUPP; if (r1 % 2) diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 3208d33a48cb..cc1d6c68356f 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -105,11 +105,22 @@ TRACE_EVENT(kvm_s390_vcpu_start_stop, {KVM_S390_PROGRAM_INT, "program interrupt"}, \ {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \ {KVM_S390_RESTART, "sigp restart"}, \ + {KVM_S390_INT_PFAULT_INIT, "pfault init"}, \ + {KVM_S390_INT_PFAULT_DONE, "pfault done"}, \ + {KVM_S390_MCHK, "machine check"}, \ + {KVM_S390_INT_CLOCK_COMP, "clock comparator"}, \ + {KVM_S390_INT_CPU_TIMER, "cpu timer"}, \ {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \ {KVM_S390_INT_SERVICE, "sclp interrupt"}, \ {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \ {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"} +#define get_irq_name(__type) \ + (__type > KVM_S390_INT_IO_MAX ? \ + __print_symbolic(__type, kvm_s390_int_type) : \ + (__type & KVM_S390_INT_IO_AI_MASK ? \ + "adapter I/O interrupt" : "subchannel I/O interrupt")) + TRACE_EVENT(kvm_s390_inject_vm, TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who), TP_ARGS(type, parm, parm64, who), @@ -131,22 +142,19 @@ TRACE_EVENT(kvm_s390_inject_vm, TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx", (__entry->who == 1) ? " (from kernel)" : (__entry->who == 2) ? " (from user)" : "", - __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), + __entry->inttype, get_irq_name(__entry->inttype), __entry->parm, __entry->parm64) ); TRACE_EVENT(kvm_s390_inject_vcpu, - TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \ - int who), - TP_ARGS(id, type, parm, parm64, who), + TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64), + TP_ARGS(id, type, parm, parm64), TP_STRUCT__entry( __field(int, id) __field(__u32, inttype) __field(__u32, parm) __field(__u64, parm64) - __field(int, who) ), TP_fast_assign( @@ -154,15 +162,12 @@ TRACE_EVENT(kvm_s390_inject_vcpu, __entry->inttype = type & 0x00000000ffffffff; __entry->parm = parm; __entry->parm64 = parm64; - __entry->who = who; ), - TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx", - (__entry->who == 1) ? " (from kernel)" : - (__entry->who == 2) ? " (from user)" : "", + TP_printk("inject (vcpu %d): type:%x (%s) parm:%x parm64:%llx", __entry->id, __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), - __entry->parm, __entry->parm64) + get_irq_name(__entry->inttype), __entry->parm, + __entry->parm64) ); /* @@ -189,8 +194,8 @@ TRACE_EVENT(kvm_s390_deliver_interrupt, TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ "data:%08llx %016llx", __entry->id, __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), - __entry->data0, __entry->data1) + get_irq_name(__entry->inttype), __entry->data0, + __entry->data1) ); /* diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index fee782acc2ee..8d2e5165865f 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -448,13 +448,13 @@ static void bpf_jit_prologue(struct bpf_jit *jit) EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, BPF_REG_1, offsetof(struct sk_buff, data)); } - /* BPF compatibility: clear A (%b7) and X (%b8) registers */ - if (REG_SEEN(BPF_REG_7)) - /* lghi %b7,0 */ - EMIT4_IMM(0xa7090000, BPF_REG_7, 0); - if (REG_SEEN(BPF_REG_8)) - /* lghi %b8,0 */ - EMIT4_IMM(0xa7090000, BPF_REG_8, 0); + /* BPF compatibility: clear A (%b0) and X (%b7) registers */ + if (REG_SEEN(BPF_REG_A)) + /* lghi %ba,0 */ + EMIT4_IMM(0xa7090000, BPF_REG_A, 0); + if (REG_SEEN(BPF_REG_X)) + /* lghi %bx,0 */ + EMIT4_IMM(0xa7090000, BPF_REG_X, 0); } /* diff --git a/arch/sparc/include/asm/visasm.h b/arch/sparc/include/asm/visasm.h index 1f0aa2024e94..6424249d5f78 100644 --- a/arch/sparc/include/asm/visasm.h +++ b/arch/sparc/include/asm/visasm.h @@ -28,16 +28,10 @@ * Must preserve %o5 between VISEntryHalf and VISExitHalf */ #define VISEntryHalf \ - rd %fprs, %o5; \ - andcc %o5, FPRS_FEF, %g0; \ - be,pt %icc, 297f; \ - sethi %hi(298f), %g7; \ - sethi %hi(VISenterhalf), %g1; \ - jmpl %g1 + %lo(VISenterhalf), %g0; \ - or %g7, %lo(298f), %g7; \ - clr %o5; \ -297: wr %o5, FPRS_FEF, %fprs; \ -298: + VISEntry + +#define VISExitHalf \ + VISExit #define VISEntryHalfFast(fail_label) \ rd %fprs, %o5; \ @@ -47,7 +41,7 @@ ba,a,pt %xcc, fail_label; \ 297: wr %o5, FPRS_FEF, %fprs; -#define VISExitHalf \ +#define VISExitHalfFast \ wr %o5, 0, %fprs; #ifndef __ASSEMBLY__ diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S index 140527a20e7d..83aeeb1dffdb 100644 --- a/arch/sparc/lib/NG4memcpy.S +++ b/arch/sparc/lib/NG4memcpy.S @@ -240,8 +240,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ add %o0, 0x40, %o0 bne,pt %icc, 1b LOAD(prefetch, %g1 + 0x200, #n_reads_strong) +#ifdef NON_USER_COPY + VISExitHalfFast +#else VISExitHalf - +#endif brz,pn %o2, .Lexit cmp %o2, 19 ble,pn %icc, .Lsmall_unaligned diff --git a/arch/sparc/lib/VISsave.S b/arch/sparc/lib/VISsave.S index b320ae9e2e2e..a063d84336d6 100644 --- a/arch/sparc/lib/VISsave.S +++ b/arch/sparc/lib/VISsave.S @@ -44,9 +44,8 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3 stx %g3, [%g6 + TI_GSR] 2: add %g6, %g1, %g3 - cmp %o5, FPRS_DU - be,pn %icc, 6f - sll %g1, 3, %g1 + mov FPRS_DU | FPRS_DL | FPRS_FEF, %o5 + sll %g1, 3, %g1 stb %o5, [%g3 + TI_FPSAVED] rd %gsr, %g2 add %g6, %g1, %g3 @@ -80,65 +79,3 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3 .align 32 80: jmpl %g7 + %g0, %g0 nop - -6: ldub [%g3 + TI_FPSAVED], %o5 - or %o5, FPRS_DU, %o5 - add %g6, TI_FPREGS+0x80, %g2 - stb %o5, [%g3 + TI_FPSAVED] - - sll %g1, 5, %g1 - add %g6, TI_FPREGS+0xc0, %g3 - wr %g0, FPRS_FEF, %fprs - membar #Sync - stda %f32, [%g2 + %g1] ASI_BLK_P - stda %f48, [%g3 + %g1] ASI_BLK_P - membar #Sync - ba,pt %xcc, 80f - nop - - .align 32 -80: jmpl %g7 + %g0, %g0 - nop - - .align 32 -VISenterhalf: - ldub [%g6 + TI_FPDEPTH], %g1 - brnz,a,pn %g1, 1f - cmp %g1, 1 - stb %g0, [%g6 + TI_FPSAVED] - stx %fsr, [%g6 + TI_XFSR] - clr %o5 - jmpl %g7 + %g0, %g0 - wr %g0, FPRS_FEF, %fprs - -1: bne,pn %icc, 2f - srl %g1, 1, %g1 - ba,pt %xcc, vis1 - sub %g7, 8, %g7 -2: addcc %g6, %g1, %g3 - sll %g1, 3, %g1 - andn %o5, FPRS_DU, %g2 - stb %g2, [%g3 + TI_FPSAVED] - - rd %gsr, %g2 - add %g6, %g1, %g3 - stx %g2, [%g3 + TI_GSR] - add %g6, %g1, %g2 - stx %fsr, [%g2 + TI_XFSR] - sll %g1, 5, %g1 -3: andcc %o5, FPRS_DL, %g0 - be,pn %icc, 4f - add %g6, TI_FPREGS, %g2 - - add %g6, TI_FPREGS+0x40, %g3 - membar #Sync - stda %f0, [%g2 + %g1] ASI_BLK_P - stda %f16, [%g3 + %g1] ASI_BLK_P - membar #Sync - ba,pt %xcc, 4f - nop - - .align 32 -4: and %o5, FPRS_DU, %o5 - jmpl %g7 + %g0, %g0 - wr %o5, FPRS_FEF, %fprs diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c index 1d649a95660c..8069ce12f20b 100644 --- a/arch/sparc/lib/ksyms.c +++ b/arch/sparc/lib/ksyms.c @@ -135,10 +135,6 @@ EXPORT_SYMBOL(copy_user_page); void VISenter(void); EXPORT_SYMBOL(VISenter); -/* CRYPTO code needs this */ -void VISenterhalf(void); -EXPORT_SYMBOL(VISenterhalf); - extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *); extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *, unsigned long *); diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index e8c2c04143cd..c667e104a0c2 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -113,8 +113,6 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from) if (!access_ok(VERIFY_READ, from, sizeof(struct compat_siginfo))) return -EFAULT; - memset(to, 0, sizeof(*to)); - err = __get_user(to->si_signo, &from->si_signo); err |= __get_user(to->si_errno, &from->si_errno); err |= __get_user(to->si_code, &from->si_code); diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 2c82bd150d43..7d69afd8b6fa 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1193,6 +1193,10 @@ static efi_status_t setup_e820(struct boot_params *params, unsigned int e820_type = 0; unsigned long m = efi->efi_memmap; +#ifdef CONFIG_X86_64 + m |= (u64)efi->efi_memmap_hi << 32; +#endif + d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); switch (d->type) { case EFI_RESERVED_TYPE: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 5a1844765a7a..a7e257d9cb90 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -140,6 +140,7 @@ sysexit_from_sys_call: */ andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp), %ecx /* User %eip */ + movq RAX(%rsp), %rax RESTORE_RSI_RDI xorl %edx, %edx /* Do not leak kernel information */ xorq %r8, %r8 @@ -219,7 +220,6 @@ sysexit_from_sys_call: 1: setbe %al /* 1 if error, 0 if not */ movzbl %al, %edi /* zero-extend that into %edi */ call __audit_syscall_exit - movq RAX(%rsp), %rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -368,6 +368,7 @@ sysretl_from_sys_call: RESTORE_RSI_RDI_RDX movl RIP(%rsp), %ecx movl EFLAGS(%rsp), %r11d + movq RAX(%rsp), %rax xorq %r10, %r10 xorq %r9, %r9 xorq %r8, %r8 diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a0bf89fd2647..4e10d73cf018 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -280,21 +280,6 @@ static inline void clear_LDT(void) set_ldt(NULL, 0); } -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT_nolock(mm_context_t *pc) -{ - set_ldt(pc->ldt, pc->size); -} - -static inline void load_LDT(mm_context_t *pc) -{ - preempt_disable(); - load_LDT_nolock(pc); - preempt_enable(); -} - static inline unsigned long get_desc_base(const struct desc_struct *desc) { return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49ec9038ec14..c12e845f59e6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -252,6 +252,11 @@ struct kvm_pio_request { int size; }; +struct rsvd_bits_validate { + u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu @@ -289,8 +294,15 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; - u64 rsvd_bits_mask[2][4]; - u64 bad_mt_xwr; + + /* + * check zero bits on shadow page table entries, these + * bits include not only hardware reserved bits but also + * the bits spte never used. + */ + struct rsvd_bits_validate shadow_zero_check; + + struct rsvd_bits_validate guest_rsvd_check; /* * Bitmap: bit set = last pte in walk @@ -358,6 +370,11 @@ struct kvm_mtrr { struct list_head head; }; +/* Hyper-V per vcpu emulation context */ +struct kvm_vcpu_hv { + u64 hv_vapic; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -514,8 +531,7 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - u64 hv_vapic; + struct kvm_vcpu_hv hyperv; cpumask_var_t wbinvd_dirty_mask; @@ -586,6 +602,17 @@ struct kvm_apic_map { struct kvm_lapic *logical_map[16][16]; }; +/* Hyper-V emulation context */ +struct kvm_hv { + u64 hv_guest_os_id; + u64 hv_hypercall; + u64 hv_tsc_page; + + /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ + u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; + u64 hv_crash_ctl; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -645,16 +672,14 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; - /* fields used by HYPER-V emulation */ - u64 hv_guest_os_id; - u64 hv_hypercall; - u64 hv_tsc_page; + struct kvm_hv hyperv; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif bool boot_vcpu_runs_old_kvmclock; + u32 bsp_vcpu_id; u64 disabled_quirks; }; @@ -1203,5 +1228,7 @@ int __x86_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem); int x86_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem); +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 09b9620a73b4..364d27481a52 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -9,8 +9,7 @@ * we put the segment information here. */ typedef struct { - void *ldt; - int size; + struct ldt_struct *ldt; #ifdef CONFIG_X86_64 /* True if mm supports a task running in 32 bit compatibility mode. */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 804a3a6030ca..984abfe47edc 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -34,6 +34,50 @@ static inline void load_mm_cr4(struct mm_struct *mm) {} #endif /* + * ldt_structs can be allocated, used, and freed, but they are never + * modified while live. + */ +struct ldt_struct { + /* + * Xen requires page-aligned LDTs with special permissions. This is + * needed to prevent us from installing evil descriptors such as + * call gates. On native, we could merge the ldt_struct and LDT + * allocations, but it's not worth trying to optimize. + */ + struct desc_struct *entries; + int size; +}; + +static inline void load_mm_ldt(struct mm_struct *mm) +{ + struct ldt_struct *ldt; + + /* lockless_dereference synchronizes with smp_store_release */ + ldt = lockless_dereference(mm->context.ldt); + + /* + * Any change to mm->context.ldt is followed by an IPI to all + * CPUs with the mm active. The LDT will not be freed until + * after the IPI is handled by all such CPUs. This means that, + * if the ldt_struct changes before we return, the values we see + * will be safe, and the new values will be loaded before we run + * any user code. + * + * NB: don't try to convert this to use RCU without extreme care. + * We would still need IRQs off, because we don't want to change + * the local LDT after an IPI loaded a newer value than the one + * that we can see. + */ + + if (unlikely(ldt)) + set_ldt(ldt->entries, ldt->size); + else + clear_LDT(); + + DEBUG_LOCKS_WARN_ON(preemptible()); +} + +/* * Used for LDT copy/destruction. */ int init_new_context(struct task_struct *tsk, struct mm_struct *mm); @@ -78,12 +122,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * was called and then modify_ldt changed * prev->context.ldt but suppressed an IPI to this CPU. * In this case, prev->context.ldt != NULL, because we - * never free an LDT while the mm still exists. That - * means that next->context.ldt != prev->context.ldt, - * because mms never share an LDT. + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. */ if (unlikely(prev->context.ldt != next->context.ldt)) - load_LDT_nolock(&next->context); + load_mm_ldt(next); } #ifdef CONFIG_SMP else { @@ -106,7 +150,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); load_mm_cr4(next); - load_LDT_nolock(&next->context); + load_mm_ldt(next); } } #endif diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index c163215abb9a..aaf59b7da98a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ struct ms_hyperv_info { u32 features; + u32 misc_features; u32 hints; }; @@ -20,4 +21,8 @@ void hyperv_vector_handler(struct pt_regs *regs); void hv_setup_vmbus_irq(void (*handler)(void)); void hv_remove_vmbus_irq(void); +void hv_setup_kexec_handler(void (*handler)(void)); +void hv_remove_kexec_handler(void); +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); +void hv_remove_crash_handler(void); #endif diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 6fe6b182c998..9dfce4e0417d 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,9 +57,9 @@ struct sigcontext { unsigned long ip; unsigned long flags; unsigned short cs; - unsigned short __pad2; /* Was called gs, but was always zero. */ - unsigned short __pad1; /* Was called fs, but was always zero. */ - unsigned short ss; + unsigned short gs; + unsigned short fs; + unsigned short __pad0; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..d7f3b3b78ac3 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15", "flags" + "r12", "r13", "r14", "r15" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,11 +100,7 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* - * There is no need to save or restore flags, because flags are always - * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL - * has no effect. - */ +/* Save restore flags to clear handle leaking NT */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index da772edd19ab..448b7ca61aee 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -47,6 +47,7 @@ #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000 #define CPU_BASED_USE_MSR_BITMAPS 0x10000000 #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 @@ -367,29 +368,29 @@ enum vmcs_field { #define TYPE_PHYSICAL_APIC_EVENT (10 << 12) #define TYPE_PHYSICAL_APIC_INST (15 << 12) -/* segment AR */ -#define SEGMENT_AR_L_MASK (1 << 13) - -#define AR_TYPE_ACCESSES_MASK 1 -#define AR_TYPE_READABLE_MASK (1 << 1) -#define AR_TYPE_WRITEABLE_MASK (1 << 2) -#define AR_TYPE_CODE_MASK (1 << 3) -#define AR_TYPE_MASK 0x0f -#define AR_TYPE_BUSY_64_TSS 11 -#define AR_TYPE_BUSY_32_TSS 11 -#define AR_TYPE_BUSY_16_TSS 3 -#define AR_TYPE_LDT 2 - -#define AR_UNUSABLE_MASK (1 << 16) -#define AR_S_MASK (1 << 4) -#define AR_P_MASK (1 << 7) -#define AR_L_MASK (1 << 13) -#define AR_DB_MASK (1 << 14) -#define AR_G_MASK (1 << 15) -#define AR_DPL_SHIFT 5 -#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) - -#define AR_RESERVD_MASK 0xfffe0f00 +/* segment AR in VMCS -- these are different from what LAR reports */ +#define VMX_SEGMENT_AR_L_MASK (1 << 13) + +#define VMX_AR_TYPE_ACCESSES_MASK 1 +#define VMX_AR_TYPE_READABLE_MASK (1 << 1) +#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2) +#define VMX_AR_TYPE_CODE_MASK (1 << 3) +#define VMX_AR_TYPE_MASK 0x0f +#define VMX_AR_TYPE_BUSY_64_TSS 11 +#define VMX_AR_TYPE_BUSY_32_TSS 11 +#define VMX_AR_TYPE_BUSY_16_TSS 3 +#define VMX_AR_TYPE_LDT 2 + +#define VMX_AR_UNUSABLE_MASK (1 << 16) +#define VMX_AR_S_MASK (1 << 4) +#define VMX_AR_P_MASK (1 << 7) +#define VMX_AR_L_MASK (1 << 13) +#define VMX_AR_DB_MASK (1 << 14) +#define VMX_AR_G_MASK (1 << 15) +#define VMX_AR_DPL_SHIFT 5 +#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3) + +#define VMX_AR_RESERVD_MASK 0xfffe0f00 #define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0) #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1) diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f36d56bd7632..f0412c50c47b 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -27,6 +27,8 @@ #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ #define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* Partition reference TSC MSR is available */ +#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) /* A partition's reference time stamp counter (TSC) page */ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 0e8a973de9ee..40836a9a7250 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -177,24 +177,9 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; - - /* - * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), - * Linux saved and restored fs and gs in these slots. This - * was counterproductive, as fsbase and gsbase were never - * saved, so arch_prctl was presumably unreliable. - * - * If these slots are ever needed for any other purpose, there - * is some risk that very old 64-bit binaries could get - * confused. I doubt that many such binaries still work, - * though, since the same patch in 2.5.64 also removed the - * 64-bit set_thread_area syscall, so it appears that there is - * no TLS API that works in both pre- and post-2.5.64 kernels. - */ - __u16 __pad2; /* Was gs. */ - __u16 __pad1; /* Was fs. */ - - __u16 ss; + __u16 gs; + __u16 fs; + __u16 __pad0; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 1fe92181ee9e..37fee272618f 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -58,6 +58,7 @@ #define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_TRAP_FLAG 37 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 @@ -106,6 +107,7 @@ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dcb52850a28f..cde732c1b495 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1424,7 +1424,7 @@ static inline void __x2apic_disable(void) { u64 msr; - if (cpu_has_apic) + if (!cpu_has_apic) return; rdmsrl(MSR_IA32_APICBASE, msr); @@ -1483,10 +1483,13 @@ void x2apic_setup(void) static __init void x2apic_disable(void) { - u32 x2apic_id; + u32 x2apic_id, state = x2apic_state; - if (x2apic_state != X2APIC_ON) - goto out; + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + + if (state != X2APIC_ON) + return; x2apic_id = read_apic_id(); if (x2apic_id >= 255) @@ -1494,9 +1497,6 @@ static __init void x2apic_disable(void) __x2apic_disable(); register_lapic_address(mp_lapic_addr); -out: - x2apic_state = X2APIC_DISABLED; - x2apic_mode = 0; } static __init void x2apic_enable(void) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 845dc0df2002..206052e55517 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -943,7 +943,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) */ if (irq < nr_legacy_irqs() && data->count == 1) { if (info->ioapic_trigger != data->trigger) - mp_register_handler(irq, data->trigger); + mp_register_handler(irq, info->ioapic_trigger); data->entry.trigger = data->trigger = info->ioapic_trigger; data->entry.polarity = data->polarity = info->ioapic_polarity; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f813261d9740..2683f36e4e0a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -322,7 +322,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector_policy(virq, irq_data->node, data, + err = assign_irq_vector_policy(virq + i, irq_data->node, data, info); if (err) goto error; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 922c5e0cea4c..cb9e5df42dd2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1410,7 +1410,7 @@ void cpu_init(void) load_sp0(t, ¤t->thread); set_tss_desc(cpu, t); load_TR_desc(); - load_LDT(&init_mm.context); + load_mm_ldt(&init_mm); clear_all_debug_regs(); dbg_restore_debug_regs(); @@ -1459,7 +1459,7 @@ void cpu_init(void) load_sp0(t, thread); set_tss_desc(cpu, t); load_TR_desc(); - load_LDT(&init_mm.context); + load_mm_ldt(&init_mm); t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index aad4bd84b475..f794bfa3c138 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/kexec.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> @@ -28,10 +29,14 @@ #include <asm/i8259.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/reboot.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); + #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); @@ -67,8 +72,47 @@ void hv_remove_vmbus_irq(void) } EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); + +void hv_setup_kexec_handler(void (*handler)(void)) +{ + hv_kexec_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); + +void hv_remove_kexec_handler(void) +{ + hv_kexec_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ + hv_crash_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_crash_handler); + +void hv_remove_crash_handler(void) +{ + hv_crash_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_crash_handler); #endif +static void hv_machine_shutdown(void) +{ + if (kexec_in_progress && hv_kexec_handler) + hv_kexec_handler(); + native_machine_shutdown(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ + if (hv_crash_handler) + hv_crash_handler(regs); + native_machine_crash_shutdown(regs); +} + + static uint32_t __init ms_hyperv_platform(void) { u32 eax; @@ -114,6 +158,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", @@ -141,6 +186,8 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif + machine_ops.shutdown = hv_machine_shutdown; + machine_ops.crash_shutdown = hv_machine_crash_shutdown; } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3658de47900f..9469dfa55607 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2179,21 +2179,25 @@ static unsigned long get_segment_base(unsigned int segment) int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { + struct ldt_struct *ldt; + if (idx > LDT_ENTRIES) return 0; - if (idx > current->active_mm->context.size) + /* IRQs are off, so this synchronizes with smp_store_release */ + ldt = lockless_dereference(current->active_mm->context.ldt); + if (!ldt || idx > ldt->size) return 0; - desc = current->active_mm->context.ldt; + desc = &ldt->entries[idx]; } else { if (idx > GDT_ENTRIES) return 0; - desc = raw_cpu_ptr(gdt_page.gdt); + desc = raw_cpu_ptr(gdt_page.gdt) + idx; } - return get_desc_base(desc + idx); + return get_desc_base(desc); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index b9826a981fb2..6326ae24e4d5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2534,7 +2534,7 @@ static int intel_pmu_cpu_prepare(int cpu) if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) - return NOTIFY_BAD; + goto err; } if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { @@ -2542,18 +2542,27 @@ static int intel_pmu_cpu_prepare(int cpu) cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); if (!cpuc->constraint_list) - return NOTIFY_BAD; + goto err_shared_regs; cpuc->excl_cntrs = allocate_excl_cntrs(cpu); - if (!cpuc->excl_cntrs) { - kfree(cpuc->constraint_list); - kfree(cpuc->shared_regs); - return NOTIFY_BAD; - } + if (!cpuc->excl_cntrs) + goto err_constraint_list; + cpuc->excl_thread_id = 0; } return NOTIFY_OK; + +err_constraint_list: + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + +err_shared_regs: + kfree(cpuc->shared_regs); + cpuc->shared_regs = NULL; + +err: + return NOTIFY_BAD; } static void intel_pmu_cpu_starting(int cpu) diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 63eb68b73589..377e8f8ed391 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -1255,7 +1255,7 @@ static inline void cqm_pick_event_reader(int cpu) cpumask_set_cpu(cpu, &cqm_cpumask); } -static void intel_cqm_cpu_prepare(unsigned int cpu) +static void intel_cqm_cpu_starting(unsigned int cpu) { struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -1296,13 +1296,11 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb, unsigned int cpu = (unsigned long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - intel_cqm_cpu_prepare(cpu); - break; case CPU_DOWN_PREPARE: intel_cqm_cpu_exit(cpu); break; case CPU_STARTING: + intel_cqm_cpu_starting(cpu); cqm_pick_event_reader(cpu); break; } @@ -1373,7 +1371,7 @@ static int __init intel_cqm_init(void) goto out; for_each_online_cpu(i) { - intel_cqm_cpu_prepare(i); + intel_cqm_cpu_starting(i); cqm_pick_event_reader(i); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 79de954626fd..d25097c3fc1d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -270,7 +270,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (src_fpu->fpstate_active) + if (src_fpu->fpstate_active && cpu_has_fpu) fpu_copy(dst_fpu, src_fpu); return 0; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 1e173f6285c7..d14e9ac3235a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -40,7 +40,12 @@ static void fpu__init_cpu_generic(void) write_cr0(cr0); /* Flush out any pending x87 state: */ - asm volatile ("fninit"); +#ifdef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) + fpstate_init_soft(¤t->thread.fpu.state.soft); + else +#endif + asm volatile ("fninit"); } /* diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c37886d759cc..2bcc0525f1c1 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/smp.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -20,82 +21,82 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> -#ifdef CONFIG_SMP +/* context.lock is held for us, so we don't need any locking. */ static void flush_ldt(void *current_mm) { - if (current->active_mm == current_mm) - load_LDT(¤t->active_mm->context); + mm_context_t *pc; + + if (current->active_mm != current_mm) + return; + + pc = ¤t->active_mm->context; + set_ldt(pc->ldt->entries, pc->ldt->size); } -#endif -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ +static struct ldt_struct *alloc_ldt_struct(int size) { - void *oldldt, *newldt; - int oldsize; - - if (mincount <= pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & - (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); - if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + struct ldt_struct *new_ldt; + int alloc_size; + + if (size > LDT_ENTRIES) + return NULL; + + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + if (!new_ldt) + return NULL; + + BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); + alloc_size = size * LDT_ENTRY_SIZE; + + /* + * Xen is very picky: it requires a page-aligned LDT that has no + * trailing nonzero bytes in any page that contains LDT descriptors. + * Keep it simple: zero the whole allocation and never allocate less + * than PAGE_SIZE. + */ + if (alloc_size > PAGE_SIZE) + new_ldt->entries = vzalloc(alloc_size); else - newldt = (void *)__get_free_page(GFP_KERNEL); - - if (!newldt) - return -ENOMEM; + new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (oldsize) - memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, - (mincount - oldsize) * LDT_ENTRY_SIZE); + if (!new_ldt->entries) { + kfree(new_ldt); + return NULL; + } - paravirt_alloc_ldt(newldt, mincount); + new_ldt->size = size; + return new_ldt; +} -#ifdef CONFIG_X86_64 - /* CHECKME: Do we really need this ? */ - wmb(); -#endif - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - - if (reload) { -#ifdef CONFIG_SMP - preempt_disable(); - load_LDT(pc); - if (!cpumask_equal(mm_cpumask(current->mm), - cpumask_of(smp_processor_id()))) - smp_call_function(flush_ldt, current->mm, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - paravirt_free_ldt(oldldt, oldsize); - if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - put_page(virt_to_page(oldldt)); - } - return 0; +/* After calling this, the LDT is immutable. */ +static void finalize_ldt_struct(struct ldt_struct *ldt) +{ + paravirt_alloc_ldt(ldt->entries, ldt->size); } -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +/* context.lock is held */ +static void install_ldt(struct mm_struct *current_mm, + struct ldt_struct *ldt) { - int err = alloc_ldt(new, old->size, 0); - int i; + /* Synchronizes with lockless_dereference in load_mm_ldt. */ + smp_store_release(¤t_mm->context.ldt, ldt); + + /* Activate the LDT for all CPUs using current_mm. */ + on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); +} - if (err < 0) - return err; +static void free_ldt_struct(struct ldt_struct *ldt) +{ + if (likely(!ldt)) + return; - for (i = 0; i < old->size; i++) - write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); - return 0; + paravirt_free_ldt(ldt->entries, ldt->size); + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(ldt->entries); + else + kfree(ldt->entries); + kfree(ldt); } /* @@ -104,17 +105,37 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) */ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + struct ldt_struct *new_ldt; struct mm_struct *old_mm; int retval = 0; mutex_init(&mm->context.lock); - mm->context.size = 0; old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); + if (!old_mm) { + mm->context.ldt = NULL; + return 0; } + + mutex_lock(&old_mm->context.lock); + if (!old_mm->context.ldt) { + mm->context.ldt = NULL; + goto out_unlock; + } + + new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); + if (!new_ldt) { + retval = -ENOMEM; + goto out_unlock; + } + + memcpy(new_ldt->entries, old_mm->context.ldt->entries, + new_ldt->size * LDT_ENTRY_SIZE); + finalize_ldt_struct(new_ldt); + + mm->context.ldt = new_ldt; + +out_unlock: + mutex_unlock(&old_mm->context.lock); return retval; } @@ -125,53 +146,47 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) */ void destroy_context(struct mm_struct *mm) { - if (mm->context.size) { -#ifdef CONFIG_X86_32 - /* CHECKME: Can this ever happen ? */ - if (mm == current->active_mm) - clear_LDT(); -#endif - paravirt_free_ldt(mm->context.ldt, mm->context.size); - if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - put_page(virt_to_page(mm->context.ldt)); - mm->context.size = 0; - } + free_ldt_struct(mm->context.ldt); + mm->context.ldt = NULL; } static int read_ldt(void __user *ptr, unsigned long bytecount) { - int err; + int retval; unsigned long size; struct mm_struct *mm = current->mm; - if (!mm->context.size) - return 0; + mutex_lock(&mm->context.lock); + + if (!mm->context.ldt) { + retval = 0; + goto out_unlock; + } + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; - mutex_lock(&mm->context.lock); - size = mm->context.size * LDT_ENTRY_SIZE; + size = mm->context.ldt->size * LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; + if (copy_to_user(ptr, mm->context.ldt->entries, size)) { + retval = -EFAULT; + goto out_unlock; + } + if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr + size, bytecount - size) != 0) { - err = -EFAULT; - goto error_return; + /* Zero-fill the rest and pretend we read bytecount bytes. */ + if (clear_user(ptr + size, bytecount - size)) { + retval = -EFAULT; + goto out_unlock; } } - return bytecount; -error_return: - return err; + retval = bytecount; + +out_unlock: + mutex_unlock(&mm->context.lock); + return retval; } static int read_default_ldt(void __user *ptr, unsigned long bytecount) @@ -195,6 +210,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) struct desc_struct ldt; int error; struct user_desc ldt_info; + int oldsize, newsize; + struct ldt_struct *new_ldt, *old_ldt; error = -EINVAL; if (bytecount != sizeof(ldt_info)) @@ -213,34 +230,39 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) goto out; } - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, - ldt_info.entry_number + 1, 1); - if (error < 0) - goto out_unlock; - } - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - memset(&ldt, 0, sizeof(ldt)); - goto install; + if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) || + LDT_empty(&ldt_info)) { + /* The user wants to clear the entry. */ + memset(&ldt, 0, sizeof(ldt)); + } else { + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + error = -EINVAL; + goto out; } + + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; } - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { - error = -EINVAL; + mutex_lock(&mm->context.lock); + + old_ldt = mm->context.ldt; + oldsize = old_ldt ? old_ldt->size : 0; + newsize = max((int)(ldt_info.entry_number + 1), oldsize); + + error = -ENOMEM; + new_ldt = alloc_ldt_struct(newsize); + if (!new_ldt) goto out_unlock; - } - fill_ldt(&ldt, &ldt_info); - if (oldmode) - ldt.avl = 0; + if (old_ldt) + memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); + new_ldt->entries[ldt_info.entry_number] = ldt; + finalize_ldt_struct(new_ldt); - /* Install the new entry ... */ -install: - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); + install_ldt(mm, new_ldt); + free_ldt_struct(old_ldt); error = 0; out_unlock: diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 397688beed4b..c27cad726765 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -408,6 +408,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) static void mwait_idle(void) { if (!current_set_polling_and_test()) { + trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { smp_mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); @@ -419,6 +420,7 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } else { local_irq_enable(); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71d7849a07f7..f6b916387590 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -121,11 +121,11 @@ void __show_regs(struct pt_regs *regs, int all) void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { - if (dead_task->mm->context.size) { + if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, dead_task->mm->context.ldt, - dead_task->mm->context.size); + dead_task->mm->context.ldt->size); BUG(); } } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 206996c1669d..71820c42b6ce 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -93,8 +93,15 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); +#else /* !CONFIG_X86_32 */ + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. */ + COPY_SEG_CPL3(cs); +#endif /* CONFIG_X86_32 */ get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -154,9 +161,8 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, #else /* !CONFIG_X86_32 */ put_user_ex(regs->flags, &sc->flags); put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->__pad2); - put_user_ex(0, &sc->__pad1); - put_user_ex(regs->ss, &sc->ss); + put_user_ex(0, &sc->gs); + put_user_ex(0, &sc->fs); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -451,19 +457,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* - * Set up the CS and SS registers to run signal handlers in - * 64-bit mode, even if the handler happens to be interrupting - * 32-bit or 16-bit code. - * - * SS is subtle. In 64-bit mode, we don't need any particular - * SS descriptor, but we do need SS to be valid. It's possible - * that the old SS is entirely bogus -- this can happen if the - * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. - */ + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. */ regs->cs = __USER_CS; - regs->ss = __USER_DS; return 0; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 9b4d51d0c0d0..0ccb53a9fcd9 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/ptrace.h> #include <asm/desc.h> +#include <asm/mmu_context.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { @@ -27,13 +28,14 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re struct desc_struct *desc; unsigned long base; - seg &= ~7UL; + seg >>= 3; mutex_lock(&child->mm->context.lock); - if (unlikely((seg >> 3) >= child->mm->context.size)) + if (unlikely(!child->mm->context.ldt || + seg >= child->mm->context.ldt->size)) addr = -1L; /* bogus selector, access would fault */ else { - desc = child->mm->context.ldt + seg; + desc = &child->mm->context.ldt->entries[seg]; base = get_desc_base(desc); /* 16-bit code segment? */ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 67d215cb8953..a1ff508bb423 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ + hyperv.o + kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c new file mode 100644 index 000000000000..a8160d2ae362 --- /dev/null +++ b/arch/x86/kvm/hyperv.c @@ -0,0 +1,377 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "x86.h" +#include "lapic.h" +#include "hyperv.h" + +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> + +#include "trace.h" + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + case HV_X64_MSR_REFERENCE_TSC: + case HV_X64_MSR_TIME_REF_COUNT: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + r = true; + break; + } + + return r; +} + +static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + *pdata = hv->hv_crash_param[index]; + return 0; +} + +static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + *pdata = hv->hv_crash_ctl; + return 0; +} + +static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (host) + hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; + + if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { + + vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", + hv->hv_crash_param[0], + hv->hv_crash_param[1], + hv->hv_crash_param[2], + hv->hv_crash_param[3], + hv->hv_crash_param[4]); + + /* Send notification about crash to user space */ + kvm_make_request(KVM_REQ_HV_CRASH, vcpu); + } + + return 0; +} + +static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 data) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + hv->hv_crash_param[index] = data; + return 0; +} + +static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, + bool host) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + hv->hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!hv->hv_guest_os_id) + hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!hv->hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + hv->hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (__copy_to_user((void __user *)addr, instructions, 4)) + return 1; + hv->hv_hypercall = data; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_REFERENCE_TSC: { + u64 gfn; + HV_REFERENCE_TSC_PAGE tsc_ref; + + memset(&tsc_ref, 0, sizeof(tsc_ref)); + hv->hv_tsc_page = data; + if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + break; + gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + if (kvm_write_guest( + kvm, + gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, + &tsc_ref, sizeof(tsc_ref))) + return 1; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_set_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + data); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + return 0; +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + u64 gfn; + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + hv->hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; + break; + } + gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); + if (kvm_is_error_hva(addr)) + return 1; + if (__clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + hv->hv_vapic = data; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, + gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + + return 0; +} + +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = hv->hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = hv->hv_hypercall; + break; + case HV_X64_MSR_TIME_REF_COUNT: { + data = + div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + break; + } + case HV_X64_MSR_REFERENCE_TSC: + data = hv->hv_tsc_page; + break; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_get_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + pdata); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { + data = r; + break; + } + } + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + data = hv->hv_vapic; + break; + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_set_msr_pw(vcpu, msr, data, host); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_set_msr(vcpu, msr, data); +} + +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_get_msr(vcpu, msr, pdata); +} + +bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + longmode = is_64_bit_mode(vcpu); + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h new file mode 100644 index 000000000000..c7bce559f67b --- /dev/null +++ b/arch/x86/kvm/hyperv.h @@ -0,0 +1,32 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef __ARCH_X86_KVM_HYPERV_H__ +#define __ARCH_X86_KVM_HYPERV_H__ + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +bool kvm_hv_hypercall_enabled(struct kvm *kvm); +int kvm_hv_hypercall(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index fef922ff2635..7cc2360f1848 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -651,15 +651,10 @@ fail_unlock: return NULL; } -void kvm_destroy_pic(struct kvm *kvm) +void kvm_destroy_pic(struct kvm_pic *vpic) { - struct kvm_pic *vpic = kvm->arch.vpic; - - if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); - kvm->arch.vpic = NULL; - kfree(vpic); - } + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ad68c73008c5..3d782a2c336a 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -74,7 +74,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm_pic *vpic); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -85,11 +85,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) static inline int irqchip_in_kernel(struct kvm *kvm) { - int ret; + struct kvm_pic *vpic = pic_irqchip(kvm); - ret = (pic_irqchip(kvm) != NULL); + /* Read vpic before kvm->irq_routing. */ smp_rmb(); - return ret; + return vpic != NULL; } void kvm_pic_reset(struct kvm_kpic_state *s); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2a5ca97c263b..9a3e342e3cda 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1900,8 +1900,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) + return; apic_set_tpr(vcpu->arch.apic, data & 0xff); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 71952748222a..764037991d26 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -91,7 +91,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { - return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; + return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44171462bd2a..fb16a8ea3dee 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -357,12 +357,6 @@ static u64 __get_spte_lockless(u64 *sptep) { return ACCESS_ONCE(*sptep); } - -static bool __check_direct_spte_mmio_pf(u64 spte) -{ - /* It is valid if the spte is zapped. */ - return spte == 0ull; -} #else union split_spte { struct { @@ -478,23 +472,6 @@ retry: return spte.spte; } - -static bool __check_direct_spte_mmio_pf(u64 spte) -{ - union split_spte sspte = (union split_spte)spte; - u32 high_mmio_mask = shadow_mmio_mask >> 32; - - /* It is valid if the spte is zapped. */ - if (spte == 0ull) - return true; - - /* It is valid if the spte is being zapped. */ - if (sspte.spte_low == 0ull && - (sspte.spte_high & high_mmio_mask) == high_mmio_mask) - return true; - - return false; -} #endif static bool spte_is_locklessly_modifiable(u64 spte) @@ -3291,54 +3268,89 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +static bool +__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { - if (direct) - return vcpu_match_mmio_gpa(vcpu, addr); + int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; - return vcpu_match_mmio_gva(vcpu, addr); + return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | + ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); +} -/* - * On direct hosts, the last spte is only allows two states - * for mmio page fault: - * - It is the mmio spte - * - It is zapped or it is being zapped. - * - * This function completely checks the spte when the last spte - * is not the mmio spte. - */ -static bool check_direct_spte_mmio_pf(u64 spte) +static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) { - return __check_direct_spte_mmio_pf(spte); + return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); } -static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ + if (direct) + return vcpu_match_mmio_gpa(vcpu, addr); + + return vcpu_match_mmio_gva(vcpu, addr); +} + +/* return true if reserved bit is detected on spte. */ +static bool +walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; - u64 spte = 0ull; + u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; + int root, leaf; + bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return spte; + goto exit; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) + + for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level; + shadow_walk_okay(&iterator); + __shadow_walk_next(&iterator, spte)) { + leaf = iterator.level; + spte = mmu_spte_get_lockless(iterator.sptep); + + sptes[leaf - 1] = spte; + if (!is_shadow_present_pte(spte)) break; + + reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + leaf); + } + walk_shadow_page_lockless_end(vcpu); - return spte; + if (reserved) { + pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + __func__, addr); + while (root >= leaf) { + pr_err("------ spte 0x%llx level %d.\n", + sptes[root - 1], root); + root--; + } + } +exit: + *sptep = spte; + return reserved; } int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; + bool reserved; if (quickly_check_mmio_pf(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; - spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + if (unlikely(reserved)) + return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); @@ -3356,13 +3368,6 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) } /* - * It's ok if the gva is remapped by other cpus on shadow guest, - * it's a BUG if the gfn is not a mmio page. - */ - if (direct && !check_direct_spte_mmio_pf(spte)) - return RET_MMIO_PF_BUG; - - /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ @@ -3604,19 +3609,21 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void +__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, int level, bool nx, bool gbpages, + bool pse) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; - context->bad_mt_xwr = 0; + rsvd_check->bad_mt_xwr = 0; - if (!context->nx) + if (!nx) exb_bit_rsvd = rsvd_bits(63, 63); - if (!guest_cpuid_has_gbpages(vcpu)) + if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); /* @@ -3626,80 +3633,95 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, if (guest_cpuid_is_amd(vcpu)) nonleaf_bit8_rsvd = rsvd_bits(8, 8); - switch (context->root_level) { + switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ - context->rsvd_bits_mask[0][1] = 0; - context->rsvd_bits_mask[0][0] = 0; - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[0][1] = 0; + rsvd_check->rsvd_bits_mask[0][0] = 0; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; - if (!is_pse(vcpu)) { - context->rsvd_bits_mask[1][1] = 0; + if (!pse) { + rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PTE */ - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: - context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | + nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[1][3] = + rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; } } -static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), context->root_level, + context->nx, guest_cpuid_has_gbpages(vcpu), + is_pse(vcpu)); +} + +static void +__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, bool execonly) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); int pte; - context->rsvd_bits_mask[0][3] = + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][1] = + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); /* large page */ - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = + rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - context->rsvd_bits_mask[1][1] = + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; for (pte = 0; pte < 64; pte++) { int rwx_bits = pte & 7; @@ -3707,10 +3729,64 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, if (mt == 0x2 || mt == 0x3 || mt == 0x7 || rwx_bits == 0x2 || rwx_bits == 0x6 || (rwx_bits == 0x4 && !execonly)) - context->bad_mt_xwr |= (1ull << pte); + rsvd_check->bad_mt_xwr |= (1ull << pte); } } +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), execonly); +} + +/* + * the page table on host is the shadow page table for the page + * table in guest or amd nested guest, its mmu features completely + * follow the features in guest. + */ +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +{ + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, context->nx, + guest_cpuid_has_gbpages(vcpu), is_pse(vcpu)); +} +EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); + +/* + * the direct page table on host, use as much mmu features as + * possible, however, kvm currently does not do execution-protection. + */ +static void +reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + if (guest_cpuid_is_amd(vcpu)) + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, false, + cpu_has_gbpages, true); + else + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + false); + +} + +/* + * as the comments in reset_shadow_zero_bits_mask() except it + * is the shadow page table for intel nested guest. + */ +static void +reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, execonly); +} + static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { @@ -3889,6 +3965,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); + reset_tdp_shadow_zero_bits_mask(vcpu, context); } void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) @@ -3916,6 +3993,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); context->base_role.smm = is_smm(vcpu); + reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -3939,6 +4017,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); + reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -4860,28 +4939,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) return nr_mmu_pages; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) -{ - struct kvm_shadow_walk_iterator iterator; - u64 spte; - int nr_sptes = 0; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return nr_sptes; - - walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { - sptes[iterator.level-1] = spte; - nr_sptes++; - if (!is_shadow_present_pte(spte)) - break; - } - walk_shadow_page_lockless_end(vcpu); - - return nr_sptes; -} -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 398d21c0f6dd..e4202e41d535 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -50,9 +50,11 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + /* * Return values of handle_mmio_page_fault_common: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index dc0a84a6f309..9e8bf13572e6 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -672,16 +672,16 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) if (iter.mtrr_disabled) return mtrr_disabled_type(); + /* not contained in any MTRRs. */ + if (type == -1) + return mtrr_default_type(mtrr_state); + /* * We just check one page, partially covered by MTRRs is * impossible. */ WARN_ON(iter.partial_map); - /* not contained in any MTRRs. */ - if (type == -1) - return mtrr_default_type(mtrr_state); - return type; } EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0f67d7e24800..736e6ab8784d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -128,14 +128,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) *access &= mask; } -static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; - - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | - ((mmu->bad_mt_xwr & (1ull << low6)) != 0); -} - static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT @@ -172,7 +164,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; if (!FNAME(is_present_gpte)(gpte)) @@ -353,8 +345,7 @@ retry_walk: if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, - walker->level))) { + if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 886aa25a7131..39b91127ef07 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -133,8 +133,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_K7_PERFCTRn */ pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); if (pmc) { - if (!msr_info->host_initiated) - data = (s64)data; pmc->counter += data - pmc_read_counter(pmc); return 0; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8e0c0844c6b9..74d825716f4f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1173,6 +1173,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm)) return 0; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED) && + kvm_read_cr0(vcpu) & X86_CR0_CD) + return _PAGE_NOCACHE; + mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn); return mtrr2protval[mtrr]; } @@ -1667,13 +1671,10 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - /* - * re-enable caching here because the QEMU bios - * does not do it - this results in some delay at - * reboot - */ - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + + /* These are emulated via page tables. */ + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); @@ -2106,6 +2107,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); + reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 83b7b5cd75d5..da1590ea43fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2443,10 +2443,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | + CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the * hardware. For example, L1 can specify an MSR bitmap - and we @@ -3423,12 +3423,12 @@ static void enter_lmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(to_vmx(vcpu)); guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~AR_TYPE_MASK) - | AR_TYPE_BUSY_64_TSS); + (guest_tr_ar & ~VMX_AR_TYPE_MASK) + | VMX_AR_TYPE_BUSY_64_TSS); } vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); } @@ -3719,7 +3719,7 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) return 0; else { int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return AR_DPL(ar); + return VMX_AR_DPL(ar); } } @@ -3847,11 +3847,11 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) if (cs.unusable) return false; - if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; - if (cs.type & AR_TYPE_WRITEABLE_MASK) { + if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; } else { @@ -3901,7 +3901,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (!var.present) return false; - if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { if (var.dpl < rpl) /* DPL < RPL */ return false; } @@ -5759,73 +5759,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } -static u64 ept_rsvd_mask(u64 spte, int level) -{ - int i; - u64 mask = 0; - - for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) - mask |= (1ULL << i); - - if (level == 4) - /* bits 7:3 reserved */ - mask |= 0xf8; - else if (spte & (1ULL << 7)) - /* - * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, - * level == 1 if the hypervisor is using the ignored bit 7. - */ - mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; - else if (level > 1) - /* bits 6:3 reserved */ - mask |= 0x78; - - return mask; -} - -static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, - int level) -{ - printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); - - /* 010b (write-only) */ - WARN_ON((spte & 0x7) == 0x2); - - /* 110b (write/execute) */ - WARN_ON((spte & 0x7) == 0x6); - - /* 100b (execute-only) and value not supported by logical processor */ - if (!cpu_has_vmx_ept_execute_only()) - WARN_ON((spte & 0x7) == 0x4); - - /* not 000b */ - if ((spte & 0x7)) { - u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); - - if (rsvd_bits != 0) { - printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", - __func__, rsvd_bits); - WARN_ON(1); - } - - /* bits 5:3 are _not_ reserved for large page or leaf page */ - if ((rsvd_bits & 0x38) == 0) { - u64 ept_mem_type = (spte & 0x38) >> 3; - - if (ept_mem_type == 2 || ept_mem_type == 3 || - ept_mem_type == 7) { - printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", - __func__, ept_mem_type); - WARN_ON(1); - } - } - } -} - static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { - u64 sptes[4]; - int nr_sptes, i, ret; + int ret; gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -5846,13 +5782,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) return 1; /* It is the real ept misconfig */ - printk(KERN_ERR "EPT: Misconfiguration.\n"); - printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); - - nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); - - for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) - ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + WARN_ON(1); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; @@ -6246,6 +6176,11 @@ static int handle_mwait(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } +static int handle_monitor_trap(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int handle_monitor(struct kvm_vcpu *vcpu) { printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); @@ -6408,8 +6343,12 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) */ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) + u32 vmx_instruction_info, bool wr, gva_t *ret) { + gva_t off; + bool exn; + struct kvm_segment s; + /* * According to Vol. 3B, "Information for VM Exits Due to Instruction * Execution", on an exit, vmx_instruction_info holds most of the @@ -6434,22 +6373,63 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); + off = exit_qualification; /* holds the displacement */ if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); + off += kvm_register_read(vcpu, base_reg); if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ + off += kvm_register_read(vcpu, index_reg)<<scaling; + vmx_get_segment(vcpu, &s, seg_reg); + *ret = s.base + off; if (addr_size == 1) /* 32 bit */ *ret &= 0xffffffff; - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ + /* Checks for #GP/#SS exceptions. */ + exn = false; + if (is_protmode(vcpu)) { + /* Protected mode: apply checks for segment validity in the + * following order: + * - segment type check (#GP(0) may be thrown) + * - usability check (#GP(0)/#SS(0)) + * - limit check (#GP(0)/#SS(0)) + */ + if (wr) + /* #GP(0) if the destination operand is located in a + * read-only data segment or any code segment. + */ + exn = ((s.type & 0xa) == 0 || (s.type & 8)); + else + /* #GP(0) if the source operand is located in an + * execute-only code segment + */ + exn = ((s.type & 0xa) == 8); + } + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + if (is_long_mode(vcpu)) { + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is an only check for long mode. + */ + exn = is_noncanonical_address(*ret); + } else if (is_protmode(vcpu)) { + /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. + */ + exn = (s.unusable != 0); + /* Protected mode: #GP(0)/#SS(0) if the memory + * operand is outside the segment limit. + */ + exn = exn || (off + sizeof(u64) > s.limit); + } + if (exn) { + kvm_queue_exception_e(vcpu, + seg_reg == VCPU_SREG_SS ? + SS_VECTOR : GP_VECTOR, + 0); + return 1; + } + return 0; } @@ -6471,7 +6451,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, int maxphyaddr = cpuid_maxphyaddr(vcpu); if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, @@ -6999,7 +6979,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, true, &gva)) return 1; /* _system ok, as nested_vmx_check_permission verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, @@ -7036,7 +7016,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { @@ -7128,7 +7108,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &vmcs_gva)) + vmx_instruction_info, true, &vmcs_gva)) return 1; /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, @@ -7184,7 +7164,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, sizeof(operand), &e)) { @@ -7282,6 +7262,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, @@ -7542,6 +7523,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_TRAP_FLAG: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5ef2560075bf..4bbc2a1676c9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -29,6 +29,7 @@ #include "cpuid.h" #include "assigned-dev.h" #include "pmu.h" +#include "hyperv.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -221,11 +222,9 @@ static void shared_msr_update(unsigned slot, u32 msr) void kvm_define_shared_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_NR_SHARED_MSRS); + shared_msrs_global.msrs[slot] = msr; if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot] = msr; - /* we need ensured the shared_msr_global have been updated */ - smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); @@ -526,7 +525,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if (is_present_gpte(pdpte[i]) && - (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + (pdpte[i] & + vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -949,6 +949,8 @@ static u32 emulated_msrs[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -1217,11 +1219,6 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, __func__, base_khz, scaled_khz, shift, *pmultiplier); } -static inline u64 get_kernel_ns(void) -{ - return ktime_get_boot_ns(); -} - #ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif @@ -1869,123 +1866,6 @@ out: return r; } -static bool kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; -} - -static bool kvm_hv_msr_partition_wide(u32 msr) -{ - bool r = false; - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - case HV_X64_MSR_REFERENCE_TSC: - case HV_X64_MSR_TIME_REF_COUNT: - r = true; - break; - } - - return r; -} - -static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - kvm->arch.hv_guest_os_id = data; - /* setting guest os id to zero disables hypercall page */ - if (!kvm->arch.hv_guest_os_id) - kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; - break; - case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; - - /* if guest os id is not set hypercall should remain disabled */ - if (!kvm->arch.hv_guest_os_id) - break; - if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { - kvm->arch.hv_hypercall = data; - break; - } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) - return 1; - kvm->arch.hv_hypercall = data; - mark_page_dirty(kvm, gfn); - break; - } - case HV_X64_MSR_REFERENCE_TSC: { - u64 gfn; - HV_REFERENCE_TSC_PAGE tsc_ref; - memset(&tsc_ref, 0, sizeof(tsc_ref)); - kvm->arch.hv_tsc_page = data; - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - break; - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, - &tsc_ref, sizeof(tsc_ref))) - return 1; - mark_page_dirty(kvm, gfn); - break; - } - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - return 0; -} - -static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { - case HV_X64_MSR_APIC_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { - vcpu->arch.hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (kvm_is_error_hva(addr)) - return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) - return 1; - vcpu->arch.hv_vapic = data; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) - return 1; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - - return 0; -} - static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; @@ -2105,7 +1985,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (guest_cpuid_has_tsc_adjust(vcpu)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; - kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); + adjust_tsc_offset_guest(vcpu, adj); } vcpu->arch.ia32_tsc_adjust_msr = data; } @@ -2224,15 +2104,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = set_msr_hyperv_pw(vcpu, msr, data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return set_msr_hyperv(vcpu, msr, data); - break; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_set_msr_common(vcpu, msr, data, + msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. @@ -2315,68 +2190,6 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = kvm->arch.hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = kvm->arch.hv_hypercall; - break; - case HV_X64_MSR_TIME_REF_COUNT: { - data = - div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); - break; - } - case HV_X64_MSR_REFERENCE_TSC: - data = kvm->arch.hv_tsc_page; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - - *pdata = data; - return 0; -} - -static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); - case HV_X64_MSR_APIC_ASSIST_PAGE: - data = vcpu->arch.hv_vapic; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} - int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { switch (msr_info->index) { @@ -2493,14 +2306,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr_info->index)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data); + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_get_msr_common(vcpu, + msr_info->index, &msr_info->data); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2651,6 +2460,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: + case KVM_CAP_SET_BOOT_CPU_ID: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3817,30 +3627,25 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { mutex_lock(&kvm->slots_lock); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_eclr); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->slots_lock); - kfree(vpic); goto create_irqchip_unlock; } } else goto create_irqchip_unlock; - smp_wmb(); - kvm->arch.vpic = vpic; - smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(kvm); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; } + /* Write kvm->irq_routing before kvm->arch.vpic. */ + smp_wmb(); + kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -3967,6 +3772,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_reinject(kvm, &control); break; } + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->arch.bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); + break; case KVM_XEN_HVM_CONFIG: { r = -EFAULT; if (copy_from_user(&kvm->arch.xen_hvm_config, argp, @@ -5882,66 +5696,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); -int kvm_hv_hypercall(struct kvm_vcpu *vcpu) -{ - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - bool fast, longmode; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif - - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; - - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - - switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); - break; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - - ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } - - return 1; -} - /* * kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6327,6 +6081,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) static void process_smi(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ds; + struct desc_ptr dt; char buf[512]; u32 cr0; @@ -6359,6 +6114,10 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_x86_ops->set_cr4(vcpu, 0); + /* Undocumented: IDT limit is set to zero on entry to SMM. */ + dt.address = dt.size = 0; + kvm_x86_ops->set_idt(vcpu, &dt); + __kvm_set_dr(vcpu, 7, DR7_FIXED_1); cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; @@ -6513,6 +6272,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); + if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -7535,6 +7300,17 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); + +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; +} + bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 0ca2f3e4803c..2f822cd886c2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,11 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } +static inline u64 get_kernel_ns(void) +{ + return ktime_get_boot_ns(); +} + static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) { return !(kvm->arch.disabled_quirks & quirk); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index f37e84ab49f3..3d8f2e421466 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -29,7 +29,6 @@ #include <asm/uaccess.h> #include <asm/traps.h> -#include <asm/desc.h> #include <asm/user.h> #include <asm/fpu/internal.h> @@ -181,7 +180,7 @@ void math_emulate(struct math_emu_info *info) math_abort(FPU_info, SIGILL); } - code_descriptor = LDT_DESCRIPTOR(FPU_CS); + code_descriptor = FPU_get_ldt_descriptor(FPU_CS); if (SEG_D_SIZE(code_descriptor)) { /* The above test may be wrong, the book is not clear */ /* Segmented 32 bit protected mode */ diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 9ccecb61a4fa..5e044d506b7a 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -16,9 +16,24 @@ #include <linux/kernel.h> #include <linux/mm.h> -/* s is always from a cpu register, and the cpu does bounds checking - * during register load --> no further bounds checks needed */ -#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3]) +#include <asm/desc.h> +#include <asm/mmu_context.h> + +static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) +{ + static struct desc_struct zero_desc; + struct desc_struct ret = zero_desc; + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + seg >>= 3; + mutex_lock(¤t->mm->context.lock); + if (current->mm->context.ldt && seg < current->mm->context.ldt->size) + ret = current->mm->context.ldt->entries[seg]; + mutex_unlock(¤t->mm->context.lock); +#endif + return ret; +} + #define SEG_D_SIZE(x) ((x).b & (3 << 21)) #define SEG_G_BIT(x) ((x).b & (1 << 23)) #define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 6ef5e99380f9..8300db71c2a6 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -20,7 +20,6 @@ #include <linux/stddef.h> #include <asm/uaccess.h> -#include <asm/desc.h> #include "fpu_system.h" #include "exception.h" @@ -158,7 +157,7 @@ static long pm_address(u_char FPU_modrm, u_char segment, addr->selector = PM_REG_(segment); } - descriptor = LDT_DESCRIPTOR(PM_REG_(segment)); + descriptor = FPU_get_ldt_descriptor(addr->selector); base_address = SEG_BASE_ADDR(descriptor); address = base_address + offset; limit = base_address diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 579a8fd74be0..be2e7a2b10d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -269,7 +269,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ offsetof(struct bpf_array, map.max_entries)); EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 44 /* number of bytes to jump */ +#define OFFSET1 47 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -278,15 +278,15 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 33 +#define OFFSET2 36 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ /* prog = array->prog[index]; */ - EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */ - EMIT1(offsetof(struct bpf_array, prog)); + EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + offsetof(struct bpf_array, prog)); EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index cfba30f27392..e4308fe6afe8 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -972,6 +972,11 @@ u64 efi_mem_attributes(unsigned long phys_addr) static int __init arch_parse_efi_cmdline(char *str) { + if (!str) { + pr_warn("need at least one option\n"); + return -EINVAL; + } + if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); if (parse_option_str(str, "debug")) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 0d7dd1f5ac36..9ab52791fed5 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -22,6 +22,7 @@ #include <asm/fpu/internal.h> #include <asm/debugreg.h> #include <asm/cpu.h> +#include <asm/mmu_context.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -153,7 +154,7 @@ static void fix_processor_context(void) syscall_init(); /* This sets MSR_*STAR and related */ #endif load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ + load_mm_ldt(current->active_mm); /* This does lldt */ fpu__resume_cpu(); } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e88fda867a33..484145368a24 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -8,7 +8,7 @@ config XEN select PARAVIRT_CLOCK select XEN_HAVE_PVMMU depends on X86_64 || (X86_32 && X86_PAE) - depends on X86_TSC + depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the @@ -17,7 +17,7 @@ config XEN config XEN_DOM0 def_bool y depends on XEN && PCI_XEN && SWIOTLB_XEN - depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI + depends on X86_IO_APIC && ACPI && PCI config XEN_PVHVM def_bool y diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 7322755f337a..4b6e29ac0968 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,13 +13,13 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o + p2m.o apic.o obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += apic.o vga.o +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0b95c9b8283f..11d6fb4e8483 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -483,6 +483,7 @@ static void set_aliased_prot(void *v, pgprot_t prot) pte_t pte; unsigned long pfn; struct page *page; + unsigned char dummy; ptep = lookup_address((unsigned long)v, &level); BUG_ON(ptep == NULL); @@ -492,6 +493,32 @@ static void set_aliased_prot(void *v, pgprot_t prot) pte = pfn_pte(pfn, prot); + /* + * Careful: update_va_mapping() will fail if the virtual address + * we're poking isn't populated in the page tables. We don't + * need to worry about the direct map (that's always in the page + * tables), but we need to be careful about vmap space. In + * particular, the top level page table can lazily propagate + * entries between processes, so if we've switched mms since we + * vmapped the target in the first place, we might not have the + * top-level page table entry populated. + * + * We disable preemption because we want the same mm active when + * we probe the target and when we issue the hypercall. We'll + * have the same nominal mm, but if we're a kernel thread, lazy + * mm dropping could change our pgd. + * + * Out of an abundance of caution, this uses __get_user() to fault + * in the target address just in case there's some obscure case + * in which the target address isn't readable. + */ + + preempt_disable(); + + pagefault_disable(); /* Avoid warnings due to being atomic. */ + __get_user(dummy, (unsigned char __user __force *)v); + pagefault_enable(); + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) BUG(); @@ -503,6 +530,8 @@ static void set_aliased_prot(void *v, pgprot_t prot) BUG(); } else kmap_flush_unused(); + + preempt_enable(); } static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) @@ -510,6 +539,17 @@ static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; + /* + * We need to mark the all aliases of the LDT pages RO. We + * don't need to call vm_flush_aliases(), though, since that's + * only responsible for flushing aliases out the TLBs, not the + * page tables, and Xen will flush the TLB for us if needed. + * + * To avoid confusing future readers: none of this is necessary + * to load the LDT. The hypervisor only checks this when the + * LDT is faulted in due to subsequent descriptor access. + */ + for(i = 0; i < entries; i += entries_per_page) set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index c20fe29e65f4..2292721b1d10 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -101,17 +101,15 @@ struct dom0_vga_console_info; #ifdef CONFIG_XEN_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); -void __init xen_init_apic(void); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) { } -static inline void __init xen_init_apic(void) -{ -} #endif +void __init xen_init_apic(void); + #ifdef CONFIG_XEN_EFI extern void xen_efi_init(void); #else |