diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 2 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 233 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/iommu.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 154 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 351 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 38 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 1123 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 210 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 3 |
15 files changed, 1708 insertions, 452 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f9d16ff56c6b..413a7bf9efbb 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -39,7 +39,9 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO + select SRCU ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index de12c1d379f1..106c01557f2b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -86,6 +86,7 @@ #define DstAcc (OpAcc << DstShift) #define DstDI (OpDI << DstShift) #define DstMem64 (OpMem64 << DstShift) +#define DstMem16 (OpMem16 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) @@ -124,6 +125,7 @@ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ +#define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) @@ -165,10 +167,10 @@ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ -#define NoBigReal ((u64)1 << 50) /* No big real mode */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ +#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -213,6 +215,7 @@ struct opcode { const struct gprefix *gprefix; const struct escape *esc; const struct instr_dual *idual; + const struct mode_dual *mdual; void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); @@ -240,6 +243,11 @@ struct instr_dual { struct opcode mod3; }; +struct mode_dual { + struct opcode mode32; + struct opcode mode64; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -262,6 +270,13 @@ struct instr_dual { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 +enum x86_transfer_type { + X86_TRANSFER_NONE, + X86_TRANSFER_CALL_JMP, + X86_TRANSFER_RET, + X86_TRANSFER_TASK_SWITCH, +}; + static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (!(ctxt->regs_valid & (1 << nr))) { @@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } if (addr.ea > lim) goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); - if (size > *max_size) - goto bad; + if (lim == 0xffffffff) + *max_size = ~0u; + else { + *max_size = (u64)lim + 1 - addr.ea; + if (size > *max_size) + goto bad; + } la &= (u32)-1; break; } @@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, const struct desc_struct *cs_desc) { enum x86emul_mode mode = ctxt->mode; + int rc; #ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { - u64 efer = 0; + if (ctxt->mode >= X86EMUL_MODE_PROT16) { + if (cs_desc->l) { + u64 efer = 0; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + mode = X86EMUL_MODE_PROT64; + } else + mode = X86EMUL_MODE_PROT32; /* temporary value */ } #endif if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - return assign_eip(ctxt, dst, mode); + rc = assign_eip(ctxt, dst, mode); + if (rc == X86EMUL_CONTINUE) + ctxt->mode = mode; + return rc; } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstcw %0": "+m"(fcw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fcw; return X86EMUL_CONTINUE; @@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstsw %0": "+m"(fsw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fsw; return X86EMUL_CONTINUE; @@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); + /* Increment ESP on POP [ESP] */ + if ((ctxt->d & IncSP) && + base_reg == VCPU_REGS_RSP) + modrm_ea += ctxt->op_bytes; } if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; @@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, ops->get_gdt(ctxt, dt); } -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc, - ulong *desc_addr_p) +static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, + u16 selector, ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - *desc_addr_p = addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + addr = dt.address + index * 8; + +#ifdef CONFIG_X86_64 + if (addr >> 32 != 0) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (!(efer & EFER_LMA)) + addr &= (u32)-1; + } +#endif + + *desc_addr_p = addr; + return X86EMUL_CONTINUE; +} + +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) +{ + int rc; + + rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); + if (rc != X86EMUL_CONTINUE) + return rc; + + return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc), &ctxt->exception); } @@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc) { - struct desc_ptr dt; - u16 index = selector >> 3; + int rc; ulong addr; - get_descriptor_table_ptr(ctxt, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); + rc = get_descriptor_ptr(ctxt, selector, &addr); + if (rc != X86EMUL_CONTINUE) + return rc; - addr = dt.address + index * 8; return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, - bool in_task_switch, + enum x86_transfer_type transfer, struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; @@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; err_code = selector & 0xfffc; - err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR; + err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR : + GP_VECTOR; /* can't load system descriptor into segment selector */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) + if (seg <= VCPU_SREG_GS && !seg_desc.s) { + if (transfer == X86_TRANSFER_CALL_JMP) + return X86EMUL_UNHANDLEABLE; goto exception; + } if (!seg_desc.p) { err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; @@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (seg_desc.s) { /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (!(seg_desc.type & 1)) { + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, selector, + &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, sizeof(base3), &ctxt->exception); @@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); } static void write_register_operand(struct operand *op) @@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) unsigned long selector; int rc; - rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); + rc = emulate_pop(ctxt, &selector, 2); if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + if (ctxt->op_bytes > 2) + rsp_increment(ctxt, ctxt->op_bytes - 2); rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; @@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + ctxt->ops->set_nmi_mask(ctxt, false); return rc; } @@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, + X86_TRANSFER_RET, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ + /* Success: write back to memory; no update of EAX */ + ctxt->src.type = OP_NONE; ctxt->dst.val = ctxt->src.orig_val; } else { /* Failure: write the value we saw to EAX. */ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.type = OP_REG; + ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.val = ctxt->dst.orig_val; + /* Create write-cycle to dest by writing the same value */ ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; @@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, - cpl, true, NULL); + cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; save_state_to_tss32(ctxt, &tss_seg); @@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; if (old_tss_sel != 0xffff) { @@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, sizeof tss_seg.prev_task_link, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; } @@ -2999,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) struct desc_struct old_desc, new_desc; const struct x86_emulate_ops *ops = ctxt->ops; int cpl = ctxt->ops->cpl(ctxt); + enum x86emul_mode prev_mode = ctxt->mode; old_eip = ctxt->_eip; ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, - &new_desc); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; + return rc; rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) @@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) rc = em_push(ctxt); /* If we failed, we tainted the memory, but the very least we should restore cs */ - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + pr_warn_once("faulting far call emulation tainted memory\n"); goto fail; + } return rc; fail: ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + ctxt->mode = prev_mode; return rc; } @@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movsxd(struct x86_emulate_ctxt *ctxt) +{ + ctxt->dst.val = (s32) ctxt->src.val; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } +#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) } #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } @@ -3738,7 +3808,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = { }; static const struct escape escape_d9 = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstcw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3896,7 +3966,7 @@ static const struct escape escape_db = { { } }; static const struct escape escape_dd = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstsw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = { I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N }; +static const struct mode_dual mode_dual_63 = { + N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd) +}; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ F6ALU(Lock, em_add), @@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = { /* 0x60 - 0x67 */ I(ImplicitOps | Stack | No64, em_pusha), I(ImplicitOps | Stack | No64, em_popa), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, MD(ModRM, &mode_dual_63), N, N, N, N, /* 0x68 - 0x6F */ I(SrcImm | Mov | Stack, em_push), @@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), - I(ImplicitOps | Stack, em_ret_far), + I(ImplicitOps | SrcImmU16, em_ret_far_imm), + I(ImplicitOps, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ @@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), @@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = { #undef I #undef GP #undef EXT +#undef MD +#undef ID #undef D2bv #undef D2bvIP @@ -4563,6 +4639,12 @@ done_prefixes: else opcode = opcode.u.idual->mod012; break; + case ModeDual: + if (ctxt->mode == X86EMUL_MODE_PROT64) + opcode = opcode.u.mdual->mode64; + else + opcode = opcode.u.mdual->mode32; + break; default: return EMULATION_FAILED; } @@ -4860,10 +4942,16 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* optimisation - avoid slow emulated read if Mov */ rc = segmented_read(ctxt, ctxt->dst.addr.mem, &ctxt->dst.val, ctxt->dst.bytes); - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + if (!(ctxt->d & NoWrite) && + rc == X86EMUL_PROPAGATE_FAULT && + ctxt->exception.vector == PF_VECTOR) + ctxt->exception.error_code |= PFERR_WRITE_MASK; goto done; + } } - ctxt->dst.orig_val = ctxt->dst.val; + /* Copy full 64-bit value for CMPXCHG8B. */ + ctxt->dst.orig_val64 = ctxt->dst.val64; special_insn: @@ -4899,11 +4987,6 @@ special_insn: goto threebyte_insn; switch (ctxt->b) { - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - ctxt->dst.val = (s32) ctxt->src.val; - break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index cc31f7c06d3d..9541ba34126b 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -507,6 +507,7 @@ static int picdev_read(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { + memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index b1947e0f3e10..46d4449772bc 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -422,6 +422,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { int i; + struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i < IOAPIC_NUM_PINS; i++) { union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; @@ -443,7 +444,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); spin_lock(&ioapic->lock); - if (trigger_mode != IOAPIC_LEVEL_TRIG) + if (trigger_mode != IOAPIC_LEVEL_TRIG || + kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) continue; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3c9195535ffc..c2e36d934af4 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 17b73eeac8a4..7dbced309ddb 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) gfn += page_size >> PAGE_SHIFT; - + cond_resched(); } return 0; @@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm, kvm_unpin_pages(kvm, pfn, unmap_pages); gfn += unmap_pages; + + cond_resched(); } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d52dcf0776ea..4ee827d7bf36 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,6 +33,7 @@ #include <asm/page.h> #include <asm/current.h> #include <asm/apicdef.h> +#include <asm/delay.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include "kvm_cache_regs.h" @@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap) return count; } -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +void __kvm_apic_update_irr(u32 *pir, void *regs) { u32 i, pir_val; - struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i <= 7; i++) { pir_val = xchg(&pir[i], 0); if (pir_val) - *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; + *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; } } +EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); + +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + __kvm_apic_update_irr(pir, apic->regs); +} EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline void apic_set_irr(int vec, struct kvm_lapic *apic) @@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); else { ++apic->isr_count; @@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); else { @@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) apic_update_ppr(apic); } -static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) { return dest == (apic_x2apic_mode(apic) ? X2APIC_BROADCAST : APIC_BROADCAST); } -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) { return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); } -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) +static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { - int result = 0; u32 logical_id; if (kvm_apic_broadcast(apic, mda)) - return 1; + return true; - if (apic_x2apic_mode(apic)) { - logical_id = kvm_apic_get_reg(apic, APIC_LDR); - return logical_id & mda; - } + logical_id = kvm_apic_get_reg(apic, APIC_LDR); - logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); + if (apic_x2apic_mode(apic)) + return ((logical_id >> 16) == (mda >> 16)) + && (logical_id & mda & 0xffff) != 0; + + logical_id = GET_APIC_LOGICAL_ID(logical_id); switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: - if (logical_id & mda) - result = 1; - break; + return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: - if (((logical_id >> 4) == (mda >> 0x4)) - && (logical_id & mda & 0xf)) - result = 1; - break; + return ((logical_id >> 4) == (mda >> 4)) + && (logical_id & mda & 0xf) != 0; default: apic_debug("Bad DFR vcpu %d: %08x\n", apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); - break; + return false; } - - return result; } -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { - int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " @@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) - /* Physical mode. */ - result = kvm_apic_match_physical_addr(target, dest); + if (dest_mode == APIC_DEST_PHYSICAL) + return kvm_apic_match_physical_addr(target, dest); else - /* Logical mode. */ - result = kvm_apic_match_logical_addr(target, dest); - break; + return kvm_apic_match_logical_addr(target, dest); case APIC_DEST_SELF: - result = (target == source); - break; + return target == source; case APIC_DEST_ALLINC: - result = 1; - break; + return true; case APIC_DEST_ALLBUT: - result = (target != source); - break; + return target != source; default: apic_debug("kvm: apic: Bad dest shorthand value %x\n", short_hand); - break; + return false; } - - return result; } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, @@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, ret = true; - if (irq->dest_mode == 0) { /* physical mode */ + if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) goto out; @@ -840,8 +833,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; @@ -1076,25 +1068,72 @@ static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; wait_queue_head_t *q = &vcpu->wq; + struct kvm_timer *ktimer = &apic->lapic_timer; - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. - */ if (atomic_read(&apic->lapic_timer.pending)) return; atomic_inc(&apic->lapic_timer.pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_set_pending_timer(vcpu); if (waitqueue_active(q)) wake_up_interruptible(q); + + if (apic_lvtt_tscdeadline(apic)) + ktimer->expired_tscdeadline = ktimer->tscdeadline; +} + +/* + * On APICv, this test will cause a busy wait + * during a higher-priority task. + */ + +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); + + if (kvm_apic_hw_enabled(apic)) { + int vec = reg & APIC_VECTOR_MASK; + void *bitmap = apic->regs + APIC_ISR; + + if (kvm_x86_ops->deliver_posted_interrupt) + bitmap = apic->regs + APIC_IRR; + + if (apic_test_vector(vec, bitmap)) + return true; + } + return false; +} + +void wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 guest_tsc, tsc_deadline; + + if (!kvm_vcpu_has_lapic(vcpu)) + return; + + if (apic->lapic_timer.expired_tscdeadline == 0) + return; + + if (!lapic_timer_int_injected(vcpu)) + return; + + tsc_deadline = apic->lapic_timer.expired_tscdeadline; + apic->lapic_timer.expired_tscdeadline = 0; + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); + + /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ + if (guest_tsc < tsc_deadline) + __delay(tsc_deadline - guest_tsc); } static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; + atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { @@ -1140,6 +1179,7 @@ static void start_apic_timer(struct kvm_lapic *apic) /* lapic timer in tsc deadline mode */ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; + ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; @@ -1154,8 +1194,10 @@ static void start_apic_timer(struct kvm_lapic *apic) if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS); } else apic_timer_expired(apic); @@ -1529,7 +1571,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); - apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); + apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1739,13 +1781,15 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? + apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (kvm_x86_ops->hwapic_irr_update) kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); + if (unlikely(kvm_x86_ops->hwapic_isr_update)) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_rtc_eoi_tracking_restore_one(vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c674fce53cf9..0bc6c656625b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -14,6 +14,7 @@ struct kvm_timer { u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; + u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ }; @@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); +void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +void wait_lapic_expire(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f83fc6c5e0ba..cee759299a35 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -63,30 +63,16 @@ enum { #undef MMU_DEBUG #ifdef MMU_DEBUG +static bool dbg = 0; +module_param(dbg, bool, 0644); #define pgprintk(x...) do { if (dbg) printk(x); } while (0) #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - +#define MMU_WARN_ON(x) WARN_ON(x) #else - #define pgprintk(x...) do { } while (0) #define rmap_printk(x...) do { } while (0) - -#endif - -#ifdef MMU_DEBUG -static bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } +#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 @@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) return (old_spte & bit_mask) && !(new_spte & bit_mask); } +static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) != (new_spte & bit_mask); +} + /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (!shadow_accessed_mask) return ret; + /* + * Flush TLB when accessed/dirty bits are changed in the page tables, + * to guarantee consistency between TLB and page tables. + */ + if (spte_is_bit_changed(old_spte, new_spte, + shadow_accessed_mask | shadow_dirty_mask)) + ret = true; + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) @@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, return flush; } +static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + + spte &= ~shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_clear_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + +static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + + spte |= shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_set_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance @@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, * Used when we do not need to care about huge page mappings: e.g. during dirty * logging we do not have any such mappings. */ -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } +/** + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * @kvm: kvm instance + * @slot: slot to clear D-bit + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should clear D-bit + * + * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. + */ +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + unsigned long *rmapp; + + while (mask) { + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmapp); + + /* clear the first set bit */ + mask &= mask - 1; + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); + +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * PT level pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + if (kvm_x86_ops->enable_log_dirty_pt_masked) + kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, + mask); + else + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; @@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { - ASSERT(is_empty_shadow_page(sp->spt)); + MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (pte_access & ACC_WRITE_MASK) + if (pte_access & ACC_WRITE_MASK) { mark_page_dirty(vcpu->kvm, gfn); + spte |= shadow_dirty_mask; + } set_pte: if (mmu_spte_update(sptep, spte)) @@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, */ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + /* + * Theoretically we could also set dirty bit (and flush TLB) here in + * order to eliminate unnecessary PML logging. See comments in + * set_spte. But fast_page_fault is very unlikely to happen with PML + * enabled, so we do not do this. This might result in the same GPA + * to be logged in PML buffer again when the write really happens, and + * eventually to be called by mark_page_dirty twice. But it's also no + * harm. This also avoids the TLB flush needed after setting dirty bit + * so non-PML cases won't be impacted. + * + * Compare with set_spte where instead shadow_dirty_mask is set. + */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) mark_page_dirty(vcpu->kvm, gfn); @@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), @@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); @@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { @@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGE_SHIFT; @@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, gpa, error_code, true); @@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - ASSERT(is_pae(vcpu)); + MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.walk_mmu; + struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; context->page_fault = tdp_page_fault; @@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_last_pte_bitmap(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else paging32_init_context(vcpu, context); - vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); - vcpu->arch.mmu.base_role.smep_andnot_wp + context->base_role.nxe = is_nx(vcpu); + context->base_role.cr4_pae = !!is_pae(vcpu); + context->base_role.cr0_wp = is_write_protection(vcpu); + context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + struct kvm_mmu *context = &vcpu->arch.mmu; + + kvm_init_shadow_mmu(vcpu); + context->set_cr3 = kvm_x86_ops->set_cr3; + context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; + context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) @@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); + init_kvm_nested_mmu(vcpu); else if (tdp_enabled) - return init_kvm_tdp_mmu(vcpu); + init_kvm_tdp_mmu(vcpu); else - return init_kvm_softmmu(vcpu); + init_kvm_softmmu(vcpu); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } @@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - ASSERT(vcpu); - /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first @@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; @@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) void kvm_mmu_setup(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - struct kvm_memory_slot *memslot; gfn_t last_gfn; int i; + bool flush = false; - memslot = id_to_memslot(kvm->memslots, slot); last_gfn = memslot->base_gfn + memslot->npages - 1; spin_lock(&kvm->mmu_lock); @@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) - __rmap_write_protect(kvm, rmapp, false); + flush |= __rmap_write_protect(kvm, rmapp, + false); if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); @@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. */ - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + unsigned long *rmapp; + unsigned long last_index, index; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, + PT_PAGE_TABLE_LEVEL); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_clear_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* + * It's also safe to flush TLBs out of mmu lock here as currently this + * function is only used for dirty logging, in which case flushing TLB + * out of mmu lock also guarantees no dirty pages will be lost in + * dirty_bitmap. + */ + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); + +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_write_protect(kvm, rmapp, + false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); + + /* see kvm_mmu_slot_remove_write_access */ + lockdep_assert_held(&kvm->slots_lock); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); + +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_set_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* see kvm_mmu_slot_leaf_clear_dirty */ + if (flush) + kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) @@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bde8ee725754..c7d65637c851 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,18 +44,6 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 - -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) - static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -81,9 +69,8 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41dd0387cccb..cc618c882f90 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1583,7 +1583,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; if (cr4 & X86_CR4_VMXE) @@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_mmu(vcpu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; @@ -3649,11 +3649,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_hwapic_isr_update(struct kvm *kvm, int isr) -{ - return; -} - static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) { return; @@ -4403,7 +4398,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, - .hwapic_isr_update = svm_hwapic_isr_update, .sync_pir_to_irr = svm_sync_pir_to_irr, .set_tss_addr = svm_set_tss_addr, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index c2a34bb5ad93..7c7bc8bef21f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc, #endif /* CONFIG_X86_64 */ +/* + * Tracepoint for PML full VMEXIT. + */ +TRACE_EVENT(kvm_pml_full, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %d: PML full", __entry->vcpu_id) +); + TRACE_EVENT(kvm_ple_window, TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), TP_ARGS(grow, vcpu_id, new, old), @@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update, __entry->flags) ); +TRACE_EVENT(kvm_wait_lapic_expire, + TP_PROTO(unsigned int vcpu_id, s64 delta), + TP_ARGS(vcpu_id, delta), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( s64, delta ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->delta = delta; + ), + + TP_printk("vcpu %u: delta %lld (%s)", + __entry->vcpu_id, + __entry->delta, + __entry->delta < 0 ? "early" : "late") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4c58d884838..ae4f6d35d19c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include <asm/perf_event.h> #include <asm/debugreg.h> #include <asm/kexec.h> +#include <asm/apic.h> #include "trace.h" @@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO); static u64 __read_mostly host_xss; +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -215,7 +219,12 @@ struct __packed vmcs12 { u64 tsc_offset; u64 virtual_apic_page_addr; u64 apic_access_addr; + u64 posted_intr_desc_addr; u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; @@ -330,6 +339,7 @@ struct __packed vmcs12 { u32 vmx_preemption_timer_value; u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; + u16 posted_intr_nv; u16 guest_es_selector; u16 guest_cs_selector; u16 guest_ss_selector; @@ -338,6 +348,7 @@ struct __packed vmcs12 { u16 guest_gs_selector; u16 guest_ldtr_selector; u16 guest_tr_selector; + u16 guest_intr_status; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -401,6 +412,10 @@ struct nested_vmx { */ struct page *apic_access_page; struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -408,6 +423,23 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + + u32 nested_vmx_procbased_ctls_low; + u32 nested_vmx_procbased_ctls_high; + u32 nested_vmx_true_procbased_ctls_low; + u32 nested_vmx_secondary_ctls_low; + u32 nested_vmx_secondary_ctls_high; + u32 nested_vmx_pinbased_ctls_low; + u32 nested_vmx_pinbased_ctls_high; + u32 nested_vmx_exit_ctls_low; + u32 nested_vmx_exit_ctls_high; + u32 nested_vmx_true_exit_ctls_low; + u32 nested_vmx_entry_ctls_low; + u32 nested_vmx_entry_ctls_high; + u32 nested_vmx_true_entry_ctls_low; + u32 nested_vmx_misc_low; + u32 nested_vmx_misc_high; + u32 nested_vmx_ept_caps; }; #define POSTED_INTR_ON 0 @@ -511,6 +543,10 @@ struct vcpu_vmx { /* Dynamic PLE window. */ int ple_window; bool ple_window_dirty; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; }; enum segment_cache_field { @@ -594,6 +630,7 @@ static int max_shadow_read_write_fields = static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), FIELD(GUEST_SS_SELECTOR, guest_ss_selector), @@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_GS_SELECTOR, guest_gs_selector), FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(TSC_OFFSET, tsc_offset), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), @@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); +static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void) return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } -static inline bool cpu_has_vmx_eptp_uncacheable(void) -{ - return vmx_capability.ept & VMX_EPTP_UC_BIT; -} - -static inline bool cpu_has_vmx_eptp_writeback(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - static inline bool cpu_has_vmx_ept_2m_page(void) { return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; @@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void) SECONDARY_EXEC_SHADOW_VMCS; } +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2108,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) { unsigned long *msr_bitmap; - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_guest_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_nested; + else if (irqchip_in_kernel(vcpu->kvm) && + apic_x2apic_mode(vcpu->arch.apic)) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -2284,20 +2347,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). - * TODO: allow these variables to be modified (downgraded) by module options - * or other means. */ -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; -static u32 nested_vmx_true_procbased_ctls_low; -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; -static u32 nested_vmx_true_exit_ctls_low; -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; -static u32 nested_vmx_true_entry_ctls_low; -static u32 nested_vmx_misc_low, nested_vmx_misc_high; -static u32 nested_vmx_ept_caps; -static __init void nested_vmx_setup_ctls_msrs(void) +static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) { /* * Note that as a general rule, the high half of the MSRs (bits in @@ -2316,57 +2367,74 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); + vmx->nested.nested_vmx_pinbased_ctls_low |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_POSTED_INTR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); - nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); + vmx->nested.nested_vmx_exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_exit_ctls_high &= + vmx->nested.nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_exit_ctls_high |= + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; if (vmx_mpx_supported()) - nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & + vmx->nested.nested_vmx_true_exit_ctls_low = + vmx->nested.nested_vmx_exit_ctls_low & ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_entry_ctls_high &= + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); + vmx->nested.nested_vmx_entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT; - nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | - VM_ENTRY_LOAD_IA32_EFER); + vmx->nested.nested_vmx_entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); if (vmx_mpx_supported()) - nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & + vmx->nested.nested_vmx_true_entry_ctls_low = + vmx->nested.nested_vmx_entry_ctls_low & ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_procbased_ctls_high &= + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); + vmx->nested.nested_vmx_procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2386,45 +2454,58 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ - nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_true_procbased_ctls_low = + vmx->nested.nested_vmx_procbased_ctls_low & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); - nested_vmx_secondary_ctls_low = 0; - nested_vmx_secondary_ctls_high &= + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); + vmx->nested.nested_vmx_secondary_ctls_low = 0; + vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; - nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT; + vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; - nested_vmx_ept_caps &= vmx_capability.ept; + vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific * for single context invalidation. Hence, only advertise * support for global context invalidation. */ - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; } else - nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_ept_caps = 0; + + if (enable_unrestricted_guest) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + rdmsr(MSR_IA32_VMX_MISC, + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + vmx->nested.nested_vmx_misc_low |= + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; - nested_vmx_misc_high = 0; + vmx->nested.nested_vmx_misc_high = 0; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -2443,6 +2524,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high) /* Returns 0 on success, non-0 otherwise. */ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + switch (msr_index) { case MSR_IA32_VMX_BASIC: /* @@ -2457,36 +2540,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, - nested_vmx_pinbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr(nested_vmx_misc_low, - nested_vmx_misc_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); break; /* * These MSRs specify bits which the guest must keep fixed (on or off) @@ -2511,12 +2602,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, - nested_vmx_secondary_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps; break; default: return 1; @@ -2785,7 +2877,7 @@ static int hardware_enable(void) u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old, test_bits; - if (read_cr4() & X86_CR4_VMXE) + if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); @@ -2812,7 +2904,7 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + cr4_set_bits(X86_CR4_VMXE); if (vmm_exclusive) { kvm_cpu_vmxon(phys_addr); @@ -2849,7 +2941,7 @@ static void hardware_disable(void) vmclear_local_loaded_vmcss(); kvm_cpu_vmxoff(); } - write_cr4(read_cr4() & ~X86_CR4_VMXE); + cr4_clear_bits(X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -2929,7 +3021,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4159,6 +4252,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, } } +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. + */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) { + WARN_ON(1); + return; + } + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) @@ -4197,6 +4336,74 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (vmx->nested.pi_desc && + vmx->nested.pi_pending) { + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return 0; + + max_irr = find_last_bit( + (unsigned long *)vmx->nested.pi_desc->pir, 256); + + if (max_irr == 256) + return 0; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + if (!vapic_page) { + WARN_ON(1); + return -ENOMEM; + } + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); + + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + return 0; +} + +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + return true; + } +#endif + return false; +} + +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { + /* the PIR and ON have been set by L1. */ + kvm_vcpu_trigger_posted_interrupt(vcpu); + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. + */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + return -1; +} /* * Send interrupt to vcpu via posted interrupt way. * 1. If target vcpu is running(non-root mode), send posted interrupt @@ -4209,17 +4416,16 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) struct vcpu_vmx *vmx = to_vmx(vcpu); int r; + r = vmx_deliver_nested_posted_interrupt(vcpu, vector); + if (!r) + return; + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; r = pi_test_and_set_on(&vmx->pi_desc); kvm_make_request(KVM_REQ_EVENT, vcpu); -#ifdef CONFIG_SMP - if (!r && (vcpu->mode == IN_GUEST_MODE)) - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); - else -#endif + if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) kvm_vcpu_kick(vcpu); } @@ -4255,7 +4461,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. */ - cr4 = read_cr4(); + cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->host_state.vmcs_host_cr4 = cr4; @@ -4360,6 +4566,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + /* PML is enabled/disabled in creating/destorying vcpu */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + return exec_control; } @@ -4986,11 +5195,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long always_on = VMXON_CR0_ALWAYSON; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_secondary_ctls_high & + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) always_on &= ~(X86_CR0_PE | X86_CR0_PG); @@ -5015,7 +5225,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vmcs12, val)) + if (!nested_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5817,13 +6027,21 @@ static __init int hardware_setup(void) (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) goto out4; + + if (nested) { + vmx_msr_bitmap_nested = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_nested) + goto out5; + } + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out5; + goto out6; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out6; + goto out7; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -5839,10 +6057,12 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + if (nested) + memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out7; + goto out8; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -5868,16 +6088,16 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; - if (!cpu_has_vmx_flexpriority()) { + if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if the - * processor does not have the APIC_ACCESS_ADDR VMCS field. - */ + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if not + * using the APIC_ACCESS_ADDR VMCS field. + */ + if (!flexpriority_enabled) kvm_x86_ops->set_apic_access_page_addr = NULL; - } if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; @@ -5895,13 +6115,11 @@ static __init int hardware_setup(void) kvm_x86_ops->update_cr8_intercept = NULL; else { kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->hwapic_isr_update = NULL; kvm_x86_ops->deliver_posted_interrupt = NULL; kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } - if (nested) - nested_vmx_setup_ctls_msrs(); - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -5945,12 +6163,29 @@ static __init int hardware_setup(void) update_ple_window_actual_max(); + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + return alloc_kvm_area(); -out7: +out8: free_page((unsigned long)vmx_vmwrite_bitmap); -out6: +out7: free_page((unsigned long)vmx_vmread_bitmap); +out6: + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: @@ -5977,6 +6212,8 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6143,6 +6380,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); +} + static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = @@ -6432,6 +6676,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); vmcs_write64(VMCS_LINK_POINTER, -1ull); } + vmx->nested.posted_intr_nv = -1; kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -6460,6 +6705,12 @@ static void free_nested(struct vcpu_vmx *vmx) nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } nested_free_all_saved_vmcss(vmx); } @@ -6893,6 +7144,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; unsigned long type; gva_t gva; @@ -6901,8 +7153,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) u64 eptp, gpa; } operand; - if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || - !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6918,7 +7171,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, @@ -6960,6 +7213,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; } +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7008,6 +7286,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_invvpid, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, }; static const int kvm_vmx_max_exit_handlers = @@ -7275,6 +7554,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* apic_write and eoi_induced should exit unconditionally. */ + return 1; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is @@ -7314,6 +7597,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static int vmx_enable_pml(struct vcpu_vmx *vmx) +{ + struct page *pml_pg; + u32 exec_control; + + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!pml_pg) + return -ENOMEM; + + vmx->pml_pg = pml_pg; + + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + + return 0; +} + +static void vmx_disable_pml(struct vcpu_vmx *vmx) +{ + u32 exec_control; + + ASSERT(vmx->pml_pg); + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); +} + +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); +} + +/* + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. + */ +static void kvm_flush_pml_buffers(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + /* + * We only need to kick vcpu out of guest mode here, as PML buffer + * is flushed at beginning of all VMEXITs, and it's obvious that only + * vcpus running in guest are possible to have unflushed GPAs in PML + * buffer. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7324,6 +7690,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + /* + * Flush logged GPAs PML buffer, this will make dirty_bitmap more + * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode as if vcpus is in root mode, the PML buffer must has been + * flushed already. + */ + if (enable_pml) + vmx_flush_pml_buffer(vmx); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -7471,9 +7847,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) u16 status; u8 old; - if (!vmx_vm_has_apicv(kvm)) - return; - if (isr == -1) isr = 0; @@ -7784,7 +8157,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - cr4 = read_cr4(); + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); vmx->host_state.vmcs_host_cr4 = cr4; @@ -7973,6 +8346,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_pml) + vmx_disable_pml(vmx); free_vpid(vmx); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8040,9 +8415,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + if (nested) + nested_vmx_setup_ctls_msrs(vmx); + + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + err = vmx_enable_pml(vmx); + if (err) + goto free_vmcs; + } + return &vmx->vcpu; free_vmcs: @@ -8184,9 +8575,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, - nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.nested_vmx_ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -8199,6 +8591,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -8206,8 +8610,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); @@ -8261,6 +8664,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, return false; } + if (nested_cpu_has_posted_intr(vmcs12)) { + if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) + return false; + + if (vmx->nested.pi_desc_page) { /* shouldn't happen */ + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + } + vmx->nested.pi_desc_page = + nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); + if (!vmx->nested.pi_desc_page) + return false; + + vmx->nested.pi_desc = + (struct pi_desc *)kmap(vmx->nested.pi_desc_page); + if (!vmx->nested.pi_desc) { + nested_release_page_clean(vmx->nested.pi_desc_page); + return false; + } + vmx->nested.pi_desc = + (struct pi_desc *)((void *)vmx->nested.pi_desc + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } + return true; } @@ -8286,6 +8714,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + u64 addr; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { + WARN_ON(1); + return -EINVAL; + } + maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || + ((addr + PAGE_SIZE) >> maxphyaddr)) + return -EINVAL; + + return 0; +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int msr; + struct page *page; + unsigned long *msr_bitmap; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); + if (!page) { + WARN_ON(1); + return false; + } + msr_bitmap = (unsigned long *)kmap(page); + if (!msr_bitmap) { + nested_release_page_clean(page); + WARN_ON(1); + return false; + } + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) + for (msr = 0x800; msr <= 0x8ff; msr++) + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + msr, MSR_TYPE_R); + /* TPR is allowed */ + nested_vmx_disable_intercept_for_msr(msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { + /* EOI and self-IPI are allowed */ + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + } else { + /* + * Enable reading intercept of all the x2apic + * MSRs. We should not rely on vmcs12 to do any + * optimizations here, it may have been modified + * by L1. + */ + for (msr = 0x800; msr <= 0x8ff; msr++) + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + msr, + MSR_TYPE_R); + + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + kunmap(page); + nested_release_page_clean(page); + + return true; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + vmcs12->posted_intr_nv & 0xff00)) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field, + int maxphyaddr) +{ + u64 count, addr; + + if (vmcs12_read_any(vcpu, count_field, &count) || + vmcs12_read_any(vcpu, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_warn_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) + return -EINVAL; + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + if (kvm_get_msr(vcpu, e.index, &e.value)) { + pr_warn_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_write_guest(vcpu->kvm, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &e.value, sizeof(e.value))) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + return -EINVAL; + } + } + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -8365,8 +9097,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | - PIN_BASED_POSTED_INTR); + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + if (nested_cpu_has_posted_intr(vmcs12)) { + /* + * Note that we use L0's vector here and in + * vmx_deliver_nested_posted_interrupt. + */ + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + vmx->nested.pi_pending = false; + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } else + exec_control &= ~PIN_BASED_POSTED_INTR; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -8423,12 +9170,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); - } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); } + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + vmcs_write64(EOI_EXIT_BITMAP0, + vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, + vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, + vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, + vmcs12->eoi_exit_bitmap3); + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -8462,11 +9223,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); } + if (cpu_has_vmx_msr_bitmap() && + exec_control & CPU_BASED_USE_MSR_BITMAPS) { + nested_vmx_merge_msr_bitmap(vcpu, vmcs12); + /* MSR_BITMAP will be set by following vmx_set_efer. */ + } else + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + /* - * Merging of IO and MSR bitmaps not currently supported. + * Merging of IO bitmap not currently supported. * Rather, exit every time. */ - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; @@ -8582,6 +9349,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) int cpu; struct loaded_vmcs *vmcs02; bool ia32e; + u32 msr_entry_idx; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -8616,41 +9384,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !PAGE_ALIGNED(vmcs12->msr_bitmap)) { + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - /*TODO: Also verify bits beyond physical address width are 0*/ + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high) || + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high) || + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high)) + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8663,7 +9432,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || + if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -8739,10 +9508,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - vmcs12->launch_state = 1; - prepare_vmcs02(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return kvm_emulate_halt(vcpu); @@ -8869,9 +9649,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (vmx->nested.nested_run_pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; } - return 0; + return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) @@ -8981,6 +9762,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); @@ -9172,6 +9956,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* @@ -9193,6 +9984,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -9235,6 +10030,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } /* * We are now running in L2, mmu_notifier will force to reload the @@ -9301,6 +10102,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } +static void vmx_slot_enable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); +} + +static void vmx_slot_disable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_set_dirty(kvm, slot); +} + +static void vmx_flush_log_dirty(struct kvm *kvm) +{ + kvm_flush_pml_buffers(kvm); +} + +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t offset, unsigned long mask) +{ + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -9409,6 +10235,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_nested_events = vmx_check_nested_events, .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c259814200bd..32bf19ef3115 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +/* lapic timer advance (tscdeadline mode only) in nanoseconds */ +unsigned int lapic_timer_advance_ns = 0; +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); + static bool backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_window", VCPU_STAT(irq_window_exits) }, { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, @@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); -int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, +static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len, u32 access) { return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, @@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk) } #endif +void kvm_set_pending_timer(struct kvm_vcpu *vcpu) +{ + /* + * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * vcpu_enter_guest. This function is only called from + * the physical CPU that is running vcpu. + */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); +} static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -unsigned long max_tsc_khz; +static unsigned long max_tsc_khz; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { @@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; @@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed; + && !backwards_tsc_observed + && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { u64 gpa_offset; + struct kvm_arch *ka = &vcpu->kvm->arch; + kvmclock_reset(vcpu); + if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { + bool tmp = (msr == MSR_KVM_SYSTEM_TIME); + + if (ka->boot_vcpu_runs_old_kvmclock != tmp) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); + + ka->boot_vcpu_runs_old_kvmclock = tmp; + } + vcpu->arch.time = data; kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); @@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } +EXPORT_SYMBOL_GPL(kvm_get_msr); static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { @@ -2716,7 +2744,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: @@ -2738,6 +2765,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: + case KVM_CAP_TSC_DEADLINE_TIMER: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2776,9 +2804,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; - case KVM_CAP_TSC_DEADLINE_TIMER: - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); - break; default: r = 0; break; @@ -3734,83 +3759,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, * @kvm: kvm instance * @log: slot id and address to which we copy the log * - * We need to keep it in mind that VCPU threads can write to the bitmap - * concurrently. So, to avoid losing data, we keep the following order for - * each bit: + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. - * 3. Flush TLB's if needed. - * 4. Copy the snapshot to the userspace. - * - * Between 2 and 3, the guest may write to the page using the remaining TLB - * entry. This is not a problem because the page will be reported dirty at - * step 4 using the snapshot taken before and step 3 ensures that successive - * writes will be logged for the next call. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - struct kvm_memory_slot *memslot; - unsigned long n, i; - unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_buffer; bool is_dirty = false; + int r; mutex_lock(&kvm->slots_lock); - r = -EINVAL; - if (log->slot >= KVM_USER_MEM_SLOTS) - goto out; - - memslot = id_to_memslot(kvm->memslots, log->slot); - - dirty_bitmap = memslot->dirty_bitmap; - r = -ENOENT; - if (!dirty_bitmap) - goto out; - - n = kvm_dirty_bitmap_bytes(memslot); - - dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); - memset(dirty_bitmap_buffer, 0, n); - - spin_lock(&kvm->mmu_lock); - - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; - - if (!dirty_bitmap[i]) - continue; - - is_dirty = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; - - offset = i * BITS_PER_LONG; - kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); - } - - spin_unlock(&kvm->mmu_lock); + /* + * Flush potentially hardware-cached dirty pages to dirty_bitmap. + */ + if (kvm_x86_ops->flush_log_dirty) + kvm_x86_ops->flush_log_dirty(kvm); - /* See the comments in kvm_mmu_slot_remove_write_access(). */ - lockdep_assert_held(&kvm->slots_lock); + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); /* * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). */ + lockdep_assert_held(&kvm->slots_lock); if (is_dirty) kvm_flush_remote_tlbs(kvm); - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - goto out; - - r = 0; -out: mutex_unlock(&kvm->slots_lock); return r; } @@ -4516,6 +4501,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, if (rc != X86EMUL_CONTINUE) return rc; addr += now; + if (ctxt->mode != X86EMUL_MODE_PROT64) + addr = (u32)addr; val += now; bytes -= now; } @@ -4984,6 +4971,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon kvm_register_write(emul_to_vcpu(ctxt), reg, val); } +static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) +{ + kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5019,6 +5011,7 @@ static const struct x86_emulate_ops emulate_ops = { .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .set_nmi_mask = emulator_set_nmi_mask, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6311,6 +6304,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* @@ -7041,15 +7035,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - int r; struct msr_data msr; struct kvm *kvm = vcpu->kvm; - r = vcpu_load(vcpu); - if (r) - return r; + if (vcpu_load(vcpu)) + return; msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; @@ -7058,8 +7050,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); - - return r; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -7549,12 +7539,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return 0; } +static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + struct kvm_memory_slot *new) +{ + /* Still write protect RO slot */ + if (new->flags & KVM_MEM_READONLY) { + kvm_mmu_slot_remove_write_access(kvm, new); + return; + } + + /* + * Call kvm_x86_ops dirty logging hooks when they are valid. + * + * kvm_x86_ops->slot_disable_log_dirty is called when: + * + * - KVM_MR_CREATE with dirty logging is disabled + * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag + * + * The reason is, in case of PML, we need to set D-bit for any slots + * with dirty logging disabled in order to eliminate unnecessary GPA + * logging in PML buffer (and potential PML buffer full VMEXT). This + * guarantees leaving PML enabled during guest's lifetime won't have + * any additonal overhead from PML when guest is running with dirty + * logging disabled for memory slots. + * + * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot + * to dirty logging mode. + * + * If kvm_x86_ops dirty logging hooks are invalid, use write protect. + * + * In case of write protect: + * + * Write protect all pages for dirty logging. + * + * All the sptes including the large sptes which point to this + * slot are set to readonly. We can not create any new large + * spte on this slot until the end of the logging. + * + * See the comments in fast_page_fault(). + */ + if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (kvm_x86_ops->slot_enable_log_dirty) + kvm_x86_ops->slot_enable_log_dirty(kvm, new); + else + kvm_mmu_slot_remove_write_access(kvm, new); + } else { + if (kvm_x86_ops->slot_disable_log_dirty) + kvm_x86_ops->slot_disable_log_dirty(kvm, new); + } +} + void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, enum kvm_mr_change change) { - + struct kvm_memory_slot *new; int nr_mmu_pages = 0; if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { @@ -7573,17 +7613,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + + /* It's OK to get 'new' slot here as it has already been installed */ + new = id_to_memslot(kvm->memslots, mem->slot); + /* - * Write protect all pages for dirty logging. + * Set up write protection and/or dirty logging for the new slot. * - * All the sptes including the large sptes which point to this - * slot are set to readonly. We can not create any new large - * spte on this slot until the end of the logging. - * - * See the comments in fast_page_fault(). + * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have + * been zapped so no dirty logging staff is needed for old slot. For + * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the + * new and it's also covered when dealing with the new slot. */ - if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_slot_remove_write_access(kvm, mem->slot); + if (change != KVM_MR_DELETE) + kvm_mmu_slot_apply_flags(kvm, new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -7837,3 +7880,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cc1d61af6140..f5fef1868096 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; +extern unsigned int lapic_timer_advance_ns; + extern struct static_key kvm_no_apic_vcpu; #endif |