summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2018-02-17 13:39:28 +0300
committerIngo Molnar <mingo@kernel.org>2018-02-17 13:39:28 +0300
commit7057bb975dab827997e0ca9dd92cafef0856b0cc (patch)
tree7784dc59c03f25b6bc4fa5cc12d5b61cb8b53765 /arch/x86
parent33ea4b24277b06dbc55d7f5772a46f029600255e (diff)
parent297f9233b53a08fd457815e19f1d6f2c3389857b (diff)
downloadlinux-7057bb975dab827997e0ca9dd92cafef0856b0cc.tar.xz
Merge branch 'perf/urgent' into perf/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/.gitignore1
-rw-r--r--arch/x86/Kconfig89
-rw-r--r--arch/x86/boot/compressed/eboot.c1
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S199
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c70
-rw-r--r--arch/x86/crypto/chacha20_glue.c1
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c1
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c1
-rw-r--r--arch/x86/crypto/poly1305_glue.c2
-rw-r--r--arch/x86/crypto/salsa20-i586-asm_32.S184
-rw-r--r--arch/x86/crypto/salsa20-x86_64-asm_64.S114
-rw-r--r--arch/x86/crypto/salsa20_glue.c105
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c10
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S112
-rw-r--r--arch/x86/entry/calling.h107
-rw-r--r--arch/x86/entry/common.c15
-rw-r--r--arch/x86/entry/entry_32.S8
-rw-r--r--arch/x86/entry/entry_64.S210
-rw-r--r--arch/x86/entry/entry_64_compat.S30
-rw-r--r--arch/x86/entry/syscall_64.c7
-rw-r--r--arch/x86/events/intel/core.c2
-rw-r--r--arch/x86/events/intel/lbr.c2
-rw-r--r--arch/x86/events/intel/p6.c2
-rw-r--r--arch/x86/hyperv/hv_init.c144
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/barrier.h28
-rw-r--r--arch/x86/include/asm/bug.h19
-rw-r--r--arch/x86/include/asm/compat.h86
-rw-r--r--arch/x86/include/asm/cpufeature.h79
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/dma-direct.h30
-rw-r--r--arch/x86/include/asm/dma-mapping.h29
-rw-r--r--arch/x86/include/asm/error-injection.h13
-rw-r--r--arch/x86/include/asm/fixmap.h6
-rw-r--r--arch/x86/include/asm/fpu/signal.h6
-rw-r--r--arch/x86/include/asm/hardirq.h3
-rw-r--r--arch/x86/include/asm/intel-family.h6
-rw-r--r--arch/x86/include/asm/intel_pmc_ipc.h6
-rw-r--r--arch/x86/include/asm/iosf_mbi.h25
-rw-r--r--arch/x86/include/asm/irq_vectors.h7
-rw-r--r--arch/x86/include/asm/kasan.h12
-rw-r--r--arch/x86/include/asm/kprobes.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h22
-rw-r--r--arch/x86/include/asm/mshyperv.h36
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/msr.h3
-rw-r--r--arch/x86/include/asm/nospec-branch.h16
-rw-r--r--arch/x86/include/asm/page_64.h4
-rw-r--r--arch/x86/include/asm/paravirt.h4
-rw-r--r--arch/x86/include/asm/paravirt_types.h2
-rw-r--r--arch/x86/include/asm/pat.h2
-rw-r--r--arch/x86/include/asm/pgtable-3level.h37
-rw-r--r--arch/x86/include/asm/pgtable.h15
-rw-r--r--arch/x86/include/asm/pgtable_32.h2
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h5
-rw-r--r--arch/x86/include/asm/pmc_core.h27
-rw-r--r--arch/x86/include/asm/processor.h17
-rw-r--r--arch/x86/include/asm/ptrace.h5
-rw-r--r--arch/x86/include/asm/svm.h3
-rw-r--r--arch/x86/include/asm/swiotlb.h2
-rw-r--r--arch/x86/include/asm/sync_core.h28
-rw-r--r--arch/x86/include/asm/syscall.h6
-rw-r--r--arch/x86/include/asm/thread_info.h3
-rw-r--r--arch/x86/include/asm/tlbflush.h29
-rw-r--r--arch/x86/include/asm/uaccess.h15
-rw-r--r--arch/x86/include/asm/uaccess_32.h6
-rw-r--r--arch/x86/include/asm/uaccess_64.h12
-rw-r--r--arch/x86/include/uapi/asm/Kbuild1
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h27
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h4
-rw-r--r--arch/x86/include/uapi/asm/poll.h1
-rw-r--r--arch/x86/kernel/acpi/boot.c3
-rw-r--r--arch/x86/kernel/alternative.c14
-rw-r--r--arch/x86/kernel/amd_gart_64.c1
-rw-r--r--arch/x86/kernel/amd_nb.c2
-rw-r--r--arch/x86/kernel/apic/apic.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c15
-rw-r--r--arch/x86/kernel/apm_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c94
-rw-r--r--arch/x86/kernel/cpu/bugs.c118
-rw-r--r--arch/x86/kernel/cpu/centaur.c4
-rw-r--r--arch/x86/kernel/cpu/common.c35
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c58
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/dev-mcelog.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c24
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c6
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c4
-rw-r--r--arch/x86/kernel/cpu/proc.c8
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/devicetree.c6
-rw-r--r--arch/x86/kernel/dumpstack.c2
-rw-r--r--arch/x86/kernel/early-quirks.c87
-rw-r--r--arch/x86/kernel/head_32.S4
-rw-r--r--arch/x86/kernel/irq.c9
-rw-r--r--arch/x86/kernel/kvm.c49
-rw-r--r--arch/x86/kernel/mpparse.c2
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/pci-dma.c23
-rw-r--r--arch/x86/kernel/pci-nommu.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb.c8
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/ptrace.c2
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S8
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/signal_compat.c123
-rw-r--r--arch/x86/kernel/smpboot.c1
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kvm/Kconfig8
-rw-r--r--arch/x86/kvm/cpuid.c44
-rw-r--r--arch/x86/kvm/cpuid.h1
-rw-r--r--arch/x86/kvm/emulate.c62
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/lapic.c25
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c36
-rw-r--r--arch/x86/kvm/mmu_audit.c2
-rw-r--r--arch/x86/kvm/svm.c1315
-rw-r--r--arch/x86/kvm/vmx.c1393
-rw-r--r--arch/x86/kvm/vmx_shadow_fields.h77
-rw-r--r--arch/x86/kvm/x86.c346
-rw-r--r--arch/x86/kvm/x86.h33
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/cpu.c2
-rw-r--r--arch/x86/lib/error-inject.c20
-rw-r--r--arch/x86/lib/getuser.S10
-rw-r--r--arch/x86/lib/usercopy_32.c8
-rw-r--r--arch/x86/mm/init_32.c9
-rw-r--r--arch/x86/mm/init_64.c100
-rw-r--r--arch/x86/mm/ioremap.c2
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/mem_encrypt.c2
-rw-r--r--arch/x86/mm/mmio-mod.c5
-rw-r--r--arch/x86/mm/pat.c19
-rw-r--r--arch/x86/mm/pgtable_32.c2
-rw-r--r--arch/x86/mm/tlb.c45
-rw-r--r--arch/x86/net/bpf_jit_comp.c106
-rw-r--r--arch/x86/pci/irq.c3
-rw-r--r--arch/x86/pci/sta2x11-fixup.c1
-rw-r--r--arch/x86/pci/xen.c4
-rw-r--r--arch/x86/platform/intel/iosf_mbi.c19
-rw-r--r--arch/x86/platform/uv/tlb_uv.c2
-rw-r--r--arch/x86/power/hibernate_32.c2
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/xen/mmu_pv.c6
-rw-r--r--arch/x86/xen/p2m.c6
-rw-r--r--arch/x86/xen/xen-head.S16
154 files changed, 4261 insertions, 2370 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
index aff152c87cf4..5a82bac5e0bc 100644
--- a/arch/x86/.gitignore
+++ b/arch/x86/.gitignore
@@ -1,6 +1,7 @@
boot/compressed/vmlinux
tools/test_get_len
tools/insn_sanity
+tools/insn_decoder_test
purgatory/kexec-purgatory.c
purgatory/purgatory.ro
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 423e4b64e683..a528c14d45a5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -54,6 +54,8 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_PHYS_TO_DMA
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
@@ -61,6 +63,7 @@ config X86
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
+ select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if X86_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -68,7 +71,6 @@ config X86
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_SUPPORTS_ATOMIC_RMW
- select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
@@ -116,6 +118,7 @@ config X86
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
select HAVE_ARCH_SECCOMP_FILTER
+ select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
@@ -154,6 +157,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+ select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH if X86_64
@@ -320,7 +324,7 @@ config X86_64_SMP
config X86_32_LAZY_GS
def_bool y
- depends on X86_32 && !CC_STACKPROTECTOR
+ depends on X86_32 && CC_STACKPROTECTOR_NONE
config ARCH_SUPPORTS_UPROBES
def_bool y
@@ -419,12 +423,6 @@ config X86_MPPARSE
For old smp systems that do not have proper acpi support. Newer systems
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
-config X86_BIGSMP
- bool "Support for big SMP systems with more than 8 CPUs"
- depends on X86_32 && SMP
- ---help---
- This option is needed for the systems that have more than 8 CPUs
-
config GOLDFISH
def_bool y
depends on X86_GOLDFISH
@@ -456,6 +454,12 @@ config INTEL_RDT
Say N if unsure.
if X86_32
+config X86_BIGSMP
+ bool "Support for big SMP systems with more than 8 CPUs"
+ depends on SMP
+ ---help---
+ This option is needed for the systems that have more than 8 CPUs
+
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -945,25 +949,66 @@ config MAXSMP
Enable maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
+#
+# The maximum number of CPUs supported:
+#
+# The main config value is NR_CPUS, which defaults to NR_CPUS_DEFAULT,
+# and which can be configured interactively in the
+# [NR_CPUS_RANGE_BEGIN ... NR_CPUS_RANGE_END] range.
+#
+# The ranges are different on 32-bit and 64-bit kernels, depending on
+# hardware capabilities and scalability features of the kernel.
+#
+# ( If MAXSMP is enabled we just use the highest possible value and disable
+# interactive configuration. )
+#
+
+config NR_CPUS_RANGE_BEGIN
+ int
+ default NR_CPUS_RANGE_END if MAXSMP
+ default 1 if !SMP
+ default 2
+
+config NR_CPUS_RANGE_END
+ int
+ depends on X86_32
+ default 64 if SMP && X86_BIGSMP
+ default 8 if SMP && !X86_BIGSMP
+ default 1 if !SMP
+
+config NR_CPUS_RANGE_END
+ int
+ depends on X86_64
+ default 8192 if SMP && ( MAXSMP || CPUMASK_OFFSTACK)
+ default 512 if SMP && (!MAXSMP && !CPUMASK_OFFSTACK)
+ default 1 if !SMP
+
+config NR_CPUS_DEFAULT
+ int
+ depends on X86_32
+ default 32 if X86_BIGSMP
+ default 8 if SMP
+ default 1 if !SMP
+
+config NR_CPUS_DEFAULT
+ int
+ depends on X86_64
+ default 8192 if MAXSMP
+ default 64 if SMP
+ default 1 if !SMP
+
config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP
- range 2 8 if SMP && X86_32 && !X86_BIGSMP
- range 2 64 if SMP && X86_32 && X86_BIGSMP
- range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
- range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
- default "1" if !SMP
- default "8192" if MAXSMP
- default "32" if SMP && X86_BIGSMP
- default "8" if SMP && X86_32
- default "64" if SMP
+ range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END
+ default NR_CPUS_DEFAULT
---help---
This allows you to specify the maximum number of CPUs which this
kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum
supported value is 8192, otherwise the maximum value is 512. The
minimum value which makes sense is 2.
- This is purely to save memory - each supported CPU adds
- approximately eight kilobytes to the kernel image.
+ This is purely to save memory: each supported CPU adds about 8KB
+ to the kernel image.
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
@@ -1264,9 +1309,9 @@ config MICROCODE
CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
initrd for microcode blobs.
- In addition, you can build-in the microcode into the kernel. For that you
- need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode
- to the CONFIG_EXTRA_FIRMWARE config option.
+ In addition, you can build the microcode into the kernel. For that you
+ need to add the vendor-supplied microcode to the CONFIG_EXTRA_FIRMWARE
+ config option.
config MICROCODE_INTEL
bool "Intel microcode loading support"
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index e56dbc67e837..353e20c3f114 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -999,6 +999,7 @@ struct boot_params *efi_main(struct efi_config *c,
/* Ask the firmware to clear memory on unclean shutdown */
efi_enable_reset_attack_mitigation(sys_table);
+ efi_retrieve_tpm2_eventlog(sys_table);
setup_graphics(boot_params);
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 3d09e3aca18d..12e8484a8ee7 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -90,30 +90,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
-.section .rodata
-.align 16
-.type aad_shift_arr, @object
-.size aad_shift_arr, 272
-aad_shift_arr:
- .octa 0xffffffffffffffffffffffffffffffff
- .octa 0xffffffffffffffffffffffffffffff0C
- .octa 0xffffffffffffffffffffffffffff0D0C
- .octa 0xffffffffffffffffffffffffff0E0D0C
- .octa 0xffffffffffffffffffffffff0F0E0D0C
- .octa 0xffffffffffffffffffffff0C0B0A0908
- .octa 0xffffffffffffffffffff0D0C0B0A0908
- .octa 0xffffffffffffffffff0E0D0C0B0A0908
- .octa 0xffffffffffffffff0F0E0D0C0B0A0908
- .octa 0xffffffffffffff0C0B0A090807060504
- .octa 0xffffffffffff0D0C0B0A090807060504
- .octa 0xffffffffff0E0D0C0B0A090807060504
- .octa 0xffffffff0F0E0D0C0B0A090807060504
- .octa 0xffffff0C0B0A09080706050403020100
- .octa 0xffff0D0C0B0A09080706050403020100
- .octa 0xff0E0D0C0B0A09080706050403020100
- .octa 0x0F0E0D0C0B0A09080706050403020100
-
-
.text
@@ -257,6 +233,37 @@ aad_shift_arr:
pxor \TMP1, \GH # result is in TMP1
.endm
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN and XMM1
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
+ cmp $8, \DLEN
+ jl _read_lt8_\@
+ mov (\DPTR), %rax
+ MOVQ_R64_XMM %rax, \XMMDst
+ sub $8, \DLEN
+ jz _done_read_partial_block_\@
+ xor %eax, %eax
+_read_next_byte_\@:
+ shl $8, %rax
+ mov 7(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_\@
+ MOVQ_R64_XMM %rax, \XMM1
+ pslldq $8, \XMM1
+ por \XMM1, \XMMDst
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax
+_read_next_byte_lt8_\@:
+ shl $8, %rax
+ mov -1(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_lt8_\@
+ MOVQ_R64_XMM %rax, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
@@ -273,62 +280,30 @@ aad_shift_arr:
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
- mov arg8, %r12 # %r12 = aadLen
- mov %r12, %r11
+ mov arg8, %r11 # %r11 = aadLen
pxor %xmm\i, %xmm\i
pxor \XMM2, \XMM2
cmp $16, %r11
- jl _get_AAD_rest8\num_initial_blocks\operation
+ jl _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor %xmm\i, \XMM2
GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
add $16, %r10
- sub $16, %r12
sub $16, %r11
cmp $16, %r11
jge _get_AAD_blocks\num_initial_blocks\operation
movdqu \XMM2, %xmm\i
+
+ /* read the last <16B of AAD */
+_get_AAD_rest\num_initial_blocks\operation:
cmp $0, %r11
je _get_AAD_done\num_initial_blocks\operation
- pxor %xmm\i,%xmm\i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\num_initial_blocks\operation:
- cmp $4, %r11
- jle _get_AAD_rest4\num_initial_blocks\operation
- movq (%r10), \TMP1
- add $8, %r10
- sub $8, %r11
- pslldq $8, \TMP1
- psrldq $8, %xmm\i
- pxor \TMP1, %xmm\i
- jmp _get_AAD_rest8\num_initial_blocks\operation
-_get_AAD_rest4\num_initial_blocks\operation:
- cmp $0, %r11
- jle _get_AAD_rest0\num_initial_blocks\operation
- mov (%r10), %eax
- movq %rax, \TMP1
- add $4, %r10
- sub $4, %r10
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
- pxor \TMP1, %xmm\i
-_get_AAD_rest0\num_initial_blocks\operation:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \TMP1
- PSHUFB_XMM \TMP1, %xmm\i
-_get_AAD_rest_final\num_initial_blocks\operation:
+ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor \XMM2, %xmm\i
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -532,62 +507,30 @@ _initial_blocks_done\num_initial_blocks\operation:
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
- mov arg8, %r12 # %r12 = aadLen
- mov %r12, %r11
+ mov arg8, %r11 # %r11 = aadLen
pxor %xmm\i, %xmm\i
pxor \XMM2, \XMM2
cmp $16, %r11
- jl _get_AAD_rest8\num_initial_blocks\operation
+ jl _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor %xmm\i, \XMM2
GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
add $16, %r10
- sub $16, %r12
sub $16, %r11
cmp $16, %r11
jge _get_AAD_blocks\num_initial_blocks\operation
movdqu \XMM2, %xmm\i
+
+ /* read the last <16B of AAD */
+_get_AAD_rest\num_initial_blocks\operation:
cmp $0, %r11
je _get_AAD_done\num_initial_blocks\operation
- pxor %xmm\i,%xmm\i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some PT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\num_initial_blocks\operation:
- cmp $4, %r11
- jle _get_AAD_rest4\num_initial_blocks\operation
- movq (%r10), \TMP1
- add $8, %r10
- sub $8, %r11
- pslldq $8, \TMP1
- psrldq $8, %xmm\i
- pxor \TMP1, %xmm\i
- jmp _get_AAD_rest8\num_initial_blocks\operation
-_get_AAD_rest4\num_initial_blocks\operation:
- cmp $0, %r11
- jle _get_AAD_rest0\num_initial_blocks\operation
- mov (%r10), %eax
- movq %rax, \TMP1
- add $4, %r10
- sub $4, %r10
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
- pxor \TMP1, %xmm\i
-_get_AAD_rest0\num_initial_blocks\operation:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \TMP1
- PSHUFB_XMM \TMP1, %xmm\i
-_get_AAD_rest_final\num_initial_blocks\operation:
+ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor \XMM2, %xmm\i
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -1386,14 +1329,6 @@ _esb_loop_\@:
*
* AAD Format with 64-bit Extended Sequence Number
*
-* aadLen:
-* from the definition of the spec, aadLen can only be 8 or 12 bytes.
-* The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-* For other sizes, the code will fail.
-*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
@@ -1487,19 +1422,16 @@ _zero_cipher_left_decrypt:
PSHUFB_XMM %xmm10, %xmm0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12
-# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
-# (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes
+ lea (%arg3,%r11,1), %r10
+ mov %r13, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+
+ lea ALL_F+16(%rip), %r12
+ sub %r13, %r12
movdqa %xmm1, %xmm2
pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ movdqu (%r12), %xmm1
# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm2
@@ -1508,9 +1440,6 @@ _zero_cipher_left_decrypt:
pxor %xmm2, %xmm8
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11
# output %r13 bytes
MOVQ_R64_XMM %xmm0, %rax
@@ -1664,14 +1593,6 @@ ENDPROC(aesni_gcm_dec)
*
* AAD Format with 64-bit Extended Sequence Number
*
-* aadLen:
-* from the definition of the spec, aadLen can only be 8 or 12 bytes.
-* The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-* For other sizes, the code will fail.
-*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
@@ -1764,19 +1685,16 @@ _zero_cipher_left_encrypt:
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
-
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
- lea SHIFT_MASK+16(%rip), %r12
+
+ lea (%arg3,%r11,1), %r10
+ mov %r13, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+
+ lea ALL_F+16(%rip), %r12
sub %r13, %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ movdqu (%r12), %xmm1
# get the appropriate mask to mask out top 16-r13 bytes of xmm0
pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
movdqa SHUF_MASK(%rip), %xmm10
@@ -1785,9 +1703,6 @@ _zero_cipher_left_encrypt:
pxor %xmm0, %xmm8
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11
-
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3bf3dcf29825..34cf1c1f8c98 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -690,8 +690,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
}
-static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
- unsigned int key_len)
+static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key,
+ unsigned int key_len)
{
struct cryptd_aead **ctx = crypto_aead_ctx(parent);
struct cryptd_aead *cryptd_tfm = *ctx;
@@ -716,8 +716,8 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead,
/* This is the Integrity Check Value (aka the authentication tag length and can
* be 8, 12 or 16 bytes long. */
-static int rfc4106_set_authsize(struct crypto_aead *parent,
- unsigned int authsize)
+static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent,
+ unsigned int authsize)
{
struct cryptd_aead **ctx = crypto_aead_ctx(parent);
struct cryptd_aead *cryptd_tfm = *ctx;
@@ -824,7 +824,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
if (sg_is_last(req->src) &&
(!PageHighMem(sg_page(req->src)) ||
req->src->offset + req->src->length <= PAGE_SIZE) &&
- sg_is_last(req->dst) &&
+ sg_is_last(req->dst) && req->dst->length &&
(!PageHighMem(sg_page(req->dst)) ||
req->dst->offset + req->dst->length <= PAGE_SIZE)) {
one_entry_in_sg = 1;
@@ -929,7 +929,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
aes_ctx);
}
-static int rfc4106_encrypt(struct aead_request *req)
+static int gcmaes_wrapper_encrypt(struct aead_request *req)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
@@ -945,7 +945,7 @@ static int rfc4106_encrypt(struct aead_request *req)
return crypto_aead_encrypt(req);
}
-static int rfc4106_decrypt(struct aead_request *req)
+static int gcmaes_wrapper_decrypt(struct aead_request *req)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
@@ -1117,7 +1117,7 @@ static int generic_gcmaes_decrypt(struct aead_request *req)
{
__be32 counter = cpu_to_be32(1);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
@@ -1128,6 +1128,30 @@ static int generic_gcmaes_decrypt(struct aead_request *req)
aes_ctx);
}
+static int generic_gcmaes_init(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+
+ return 0;
+}
+
+static void generic_gcmaes_exit(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+
static struct aead_alg aesni_aead_algs[] = { {
.setkey = common_rfc4106_set_key,
.setauthsize = common_rfc4106_set_authsize,
@@ -1147,10 +1171,10 @@ static struct aead_alg aesni_aead_algs[] = { {
}, {
.init = rfc4106_init,
.exit = rfc4106_exit,
- .setkey = rfc4106_set_key,
- .setauthsize = rfc4106_set_authsize,
- .encrypt = rfc4106_encrypt,
- .decrypt = rfc4106_decrypt,
+ .setkey = gcmaes_wrapper_set_key,
+ .setauthsize = gcmaes_wrapper_set_authsize,
+ .encrypt = gcmaes_wrapper_encrypt,
+ .decrypt = gcmaes_wrapper_decrypt,
.ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
@@ -1170,13 +1194,31 @@ static struct aead_alg aesni_aead_algs[] = { {
.ivsize = GCM_AES_IV_SIZE,
.maxauthsize = 16,
.base = {
+ .cra_name = "__generic-gcm-aes-aesni",
+ .cra_driver_name = "__driver-generic-gcm-aes-aesni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
+ .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_module = THIS_MODULE,
+ },
+}, {
+ .init = generic_gcmaes_init,
+ .exit = generic_gcmaes_exit,
+ .setkey = gcmaes_wrapper_set_key,
+ .setauthsize = gcmaes_wrapper_set_authsize,
+ .encrypt = gcmaes_wrapper_encrypt,
+ .decrypt = gcmaes_wrapper_decrypt,
+ .ivsize = GCM_AES_IV_SIZE,
+ .maxauthsize = 16,
+ .base = {
.cra_name = "gcm(aes)",
.cra_driver_name = "generic-gcm-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
.cra_module = THIS_MODULE,
},
} };
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 1e6af1b35f7b..dce7c5d39c2f 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -107,7 +107,6 @@ static struct skcipher_alg alg = {
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha20_ctx),
- .base.cra_alignmask = sizeof(u32) - 1,
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA20_KEY_SIZE,
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 27226df3f7d8..c8d9cdacbf10 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -162,6 +162,7 @@ static struct shash_alg alg = {
.cra_name = "crc32",
.cra_driver_name = "crc32-pclmul",
.cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.cra_blocksize = CHKSUM_BLOCK_SIZE,
.cra_ctxsize = sizeof(u32),
.cra_module = THIS_MODULE,
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index c194d5717ae5..5773e1161072 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -226,6 +226,7 @@ static struct shash_alg alg = {
.cra_name = "crc32c",
.cra_driver_name = "crc32c-intel",
.cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.cra_blocksize = CHKSUM_BLOCK_SIZE,
.cra_ctxsize = sizeof(u32),
.cra_module = THIS_MODULE,
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index e32142bc071d..790377797544 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -164,14 +164,12 @@ static struct shash_alg alg = {
.init = poly1305_simd_init,
.update = poly1305_simd_update,
.final = crypto_poly1305_final,
- .setkey = crypto_poly1305_setkey,
.descsize = sizeof(struct poly1305_simd_desc_ctx),
.base = {
.cra_name = "poly1305",
.cra_driver_name = "poly1305-simd",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
- .cra_alignmask = sizeof(u32) - 1,
.cra_blocksize = POLY1305_BLOCK_SIZE,
.cra_module = THIS_MODULE,
},
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
index 329452b8f794..6014b7b9e52a 100644
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ b/arch/x86/crypto/salsa20-i586-asm_32.S
@@ -1,6 +1,7 @@
-# salsa20_pm.s version 20051229
-# D. J. Bernstein
-# Public domain.
+# Derived from:
+# salsa20_pm.s version 20051229
+# D. J. Bernstein
+# Public domain.
#include <linux/linkage.h>
@@ -935,180 +936,3 @@ ENTRY(salsa20_encrypt_bytes)
# goto bytesatleast1
jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,64(%esp)
- # ebx_stack = ebx
- movl %ebx,68(%esp)
- # esi_stack = esi
- movl %esi,72(%esp)
- # edi_stack = edi
- movl %edi,76(%esp)
- # ebp_stack = ebp
- movl %ebp,80(%esp)
- # k = arg2
- movl 8(%esp,%eax),%ecx
- # kbits = arg3
- movl 12(%esp,%eax),%edx
- # x = arg1
- movl 4(%esp,%eax),%eax
- # in1 = *(uint32 *) (k + 0)
- movl 0(%ecx),%ebx
- # in2 = *(uint32 *) (k + 4)
- movl 4(%ecx),%esi
- # in3 = *(uint32 *) (k + 8)
- movl 8(%ecx),%edi
- # in4 = *(uint32 *) (k + 12)
- movl 12(%ecx),%ebp
- # *(uint32 *) (x + 4) = in1
- movl %ebx,4(%eax)
- # *(uint32 *) (x + 8) = in2
- movl %esi,8(%eax)
- # *(uint32 *) (x + 12) = in3
- movl %edi,12(%eax)
- # *(uint32 *) (x + 16) = in4
- movl %ebp,16(%eax)
- # kbits - 256
- cmp $256,%edx
- # goto kbits128 if unsigned<
- jb ._kbits128
-._kbits256:
- # in11 = *(uint32 *) (k + 16)
- movl 16(%ecx),%edx
- # in12 = *(uint32 *) (k + 20)
- movl 20(%ecx),%ebx
- # in13 = *(uint32 *) (k + 24)
- movl 24(%ecx),%esi
- # in14 = *(uint32 *) (k + 28)
- movl 28(%ecx),%ecx
- # *(uint32 *) (x + 44) = in11
- movl %edx,44(%eax)
- # *(uint32 *) (x + 48) = in12
- movl %ebx,48(%eax)
- # *(uint32 *) (x + 52) = in13
- movl %esi,52(%eax)
- # *(uint32 *) (x + 56) = in14
- movl %ecx,56(%eax)
- # in0 = 1634760805
- mov $1634760805,%ecx
- # in5 = 857760878
- mov $857760878,%edx
- # in10 = 2036477234
- mov $2036477234,%ebx
- # in15 = 1797285236
- mov $1797285236,%esi
- # *(uint32 *) (x + 0) = in0
- movl %ecx,0(%eax)
- # *(uint32 *) (x + 20) = in5
- movl %edx,20(%eax)
- # *(uint32 *) (x + 40) = in10
- movl %ebx,40(%eax)
- # *(uint32 *) (x + 60) = in15
- movl %esi,60(%eax)
- # goto keysetupdone
- jmp ._keysetupdone
-._kbits128:
- # in11 = *(uint32 *) (k + 0)
- movl 0(%ecx),%edx
- # in12 = *(uint32 *) (k + 4)
- movl 4(%ecx),%ebx
- # in13 = *(uint32 *) (k + 8)
- movl 8(%ecx),%esi
- # in14 = *(uint32 *) (k + 12)
- movl 12(%ecx),%ecx
- # *(uint32 *) (x + 44) = in11
- movl %edx,44(%eax)
- # *(uint32 *) (x + 48) = in12
- movl %ebx,48(%eax)
- # *(uint32 *) (x + 52) = in13
- movl %esi,52(%eax)
- # *(uint32 *) (x + 56) = in14
- movl %ecx,56(%eax)
- # in0 = 1634760805
- mov $1634760805,%ecx
- # in5 = 824206446
- mov $824206446,%edx
- # in10 = 2036477238
- mov $2036477238,%ebx
- # in15 = 1797285236
- mov $1797285236,%esi
- # *(uint32 *) (x + 0) = in0
- movl %ecx,0(%eax)
- # *(uint32 *) (x + 20) = in5
- movl %edx,20(%eax)
- # *(uint32 *) (x + 40) = in10
- movl %ebx,40(%eax)
- # *(uint32 *) (x + 60) = in15
- movl %esi,60(%eax)
-._keysetupdone:
- # eax = eax_stack
- movl 64(%esp),%eax
- # ebx = ebx_stack
- movl 68(%esp),%ebx
- # esi = esi_stack
- movl 72(%esp),%esi
- # edi = edi_stack
- movl 76(%esp),%edi
- # ebp = ebp_stack
- movl 80(%esp),%ebp
- # leave
- add %eax,%esp
- ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,64(%esp)
- # ebx_stack = ebx
- movl %ebx,68(%esp)
- # esi_stack = esi
- movl %esi,72(%esp)
- # edi_stack = edi
- movl %edi,76(%esp)
- # ebp_stack = ebp
- movl %ebp,80(%esp)
- # iv = arg2
- movl 8(%esp,%eax),%ecx
- # x = arg1
- movl 4(%esp,%eax),%eax
- # in6 = *(uint32 *) (iv + 0)
- movl 0(%ecx),%edx
- # in7 = *(uint32 *) (iv + 4)
- movl 4(%ecx),%ecx
- # in8 = 0
- mov $0,%ebx
- # in9 = 0
- mov $0,%esi
- # *(uint32 *) (x + 24) = in6
- movl %edx,24(%eax)
- # *(uint32 *) (x + 28) = in7
- movl %ecx,28(%eax)
- # *(uint32 *) (x + 32) = in8
- movl %ebx,32(%eax)
- # *(uint32 *) (x + 36) = in9
- movl %esi,36(%eax)
- # eax = eax_stack
- movl 64(%esp),%eax
- # ebx = ebx_stack
- movl 68(%esp),%ebx
- # esi = esi_stack
- movl 72(%esp),%esi
- # edi = edi_stack
- movl 76(%esp),%edi
- # ebp = ebp_stack
- movl 80(%esp),%ebp
- # leave
- add %eax,%esp
- ret
-ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
index 10db30d58006..03a4918f41ee 100644
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -803,117 +803,3 @@ ENTRY(salsa20_encrypt_bytes)
# goto bytesatleast1
jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # k = arg2
- mov %rsi,%rsi
- # kbits = arg3
- mov %rdx,%rdx
- # x = arg1
- mov %rdi,%rdi
- # in0 = *(uint64 *) (k + 0)
- movq 0(%rsi),%r8
- # in2 = *(uint64 *) (k + 8)
- movq 8(%rsi),%r9
- # *(uint64 *) (x + 4) = in0
- movq %r8,4(%rdi)
- # *(uint64 *) (x + 12) = in2
- movq %r9,12(%rdi)
- # unsigned<? kbits - 256
- cmp $256,%rdx
- # comment:fp stack unchanged by jump
- # goto kbits128 if unsigned<
- jb ._kbits128
-# kbits256:
-._kbits256:
- # in10 = *(uint64 *) (k + 16)
- movq 16(%rsi),%rdx
- # in12 = *(uint64 *) (k + 24)
- movq 24(%rsi),%rsi
- # *(uint64 *) (x + 44) = in10
- movq %rdx,44(%rdi)
- # *(uint64 *) (x + 52) = in12
- movq %rsi,52(%rdi)
- # in0 = 1634760805
- mov $1634760805,%rsi
- # in4 = 857760878
- mov $857760878,%rdx
- # in10 = 2036477234
- mov $2036477234,%rcx
- # in14 = 1797285236
- mov $1797285236,%r8
- # *(uint32 *) (x + 0) = in0
- movl %esi,0(%rdi)
- # *(uint32 *) (x + 20) = in4
- movl %edx,20(%rdi)
- # *(uint32 *) (x + 40) = in10
- movl %ecx,40(%rdi)
- # *(uint32 *) (x + 60) = in14
- movl %r8d,60(%rdi)
- # comment:fp stack unchanged by jump
- # goto keysetupdone
- jmp ._keysetupdone
-# kbits128:
-._kbits128:
- # in10 = *(uint64 *) (k + 0)
- movq 0(%rsi),%rdx
- # in12 = *(uint64 *) (k + 8)
- movq 8(%rsi),%rsi
- # *(uint64 *) (x + 44) = in10
- movq %rdx,44(%rdi)
- # *(uint64 *) (x + 52) = in12
- movq %rsi,52(%rdi)
- # in0 = 1634760805
- mov $1634760805,%rsi
- # in4 = 824206446
- mov $824206446,%rdx
- # in10 = 2036477238
- mov $2036477238,%rcx
- # in14 = 1797285236
- mov $1797285236,%r8
- # *(uint32 *) (x + 0) = in0
- movl %esi,0(%rdi)
- # *(uint32 *) (x + 20) = in4
- movl %edx,20(%rdi)
- # *(uint32 *) (x + 40) = in10
- movl %ecx,40(%rdi)
- # *(uint32 *) (x + 60) = in14
- movl %r8d,60(%rdi)
-# keysetupdone:
-._keysetupdone:
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # iv = arg2
- mov %rsi,%rsi
- # x = arg1
- mov %rdi,%rdi
- # in6 = *(uint64 *) (iv + 0)
- movq 0(%rsi),%rsi
- # in8 = 0
- mov $0,%r8
- # *(uint64 *) (x + 24) = in6
- movq %rsi,24(%rdi)
- # *(uint64 *) (x + 32) = in8
- movq %r8,32(%rdi)
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
-ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index cb91a64a99e7..b07d7d959806 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -11,6 +11,9 @@
* - x86-64 version, renamed as salsa20-x86_64-asm_64.S
* available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
*
+ * Also modified to set up the initial state using the generic C code rather
+ * than in assembly.
+ *
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
@@ -18,93 +21,65 @@
*
*/
-#include <crypto/algapi.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/salsa20.h>
#include <linux/module.h>
-#include <linux/crypto.h>
-
-#define SALSA20_IV_SIZE 8U
-#define SALSA20_MIN_KEY_SIZE 16U
-#define SALSA20_MAX_KEY_SIZE 32U
-
-struct salsa20_ctx
-{
- u32 input[16];
-};
-asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
- u32 keysize, u32 ivsize);
-asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
-asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
- const u8 *src, u8 *dst, u32 bytes);
+asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
+ u32 bytes);
-static int setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int keysize)
+static int salsa20_asm_crypt(struct skcipher_request *req)
{
- struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
- salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
- return 0;
-}
-
-static int encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
-{
- struct blkcipher_walk walk;
- struct crypto_blkcipher *tfm = desc->tfm;
- struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ u32 state[16];
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt_block(desc, &walk, 64);
+ err = skcipher_walk_virt(&walk, req, true);
- salsa20_ivsetup(ctx, walk.iv);
+ crypto_salsa20_init(state, ctx, walk.iv);
- while (walk.nbytes >= 64) {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes - (walk.nbytes % 64));
- err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
- }
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
- if (walk.nbytes) {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr, walk.nbytes);
- err = blkcipher_walk_done(desc, &walk, 0);
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ salsa20_encrypt_bytes(state, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
-static struct crypto_alg alg = {
- .cra_name = "salsa20",
- .cra_driver_name = "salsa20-asm",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
- .cra_type = &crypto_blkcipher_type,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct salsa20_ctx),
- .cra_alignmask = 3,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .blkcipher = {
- .setkey = setkey,
- .encrypt = encrypt,
- .decrypt = encrypt,
- .min_keysize = SALSA20_MIN_KEY_SIZE,
- .max_keysize = SALSA20_MAX_KEY_SIZE,
- .ivsize = SALSA20_IV_SIZE,
- }
- }
+static struct skcipher_alg alg = {
+ .base.cra_name = "salsa20",
+ .base.cra_driver_name = "salsa20-asm",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct salsa20_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = SALSA20_MIN_KEY_SIZE,
+ .max_keysize = SALSA20_MAX_KEY_SIZE,
+ .ivsize = SALSA20_IV_SIZE,
+ .chunksize = SALSA20_BLOCK_SIZE,
+ .setkey = crypto_salsa20_setkey,
+ .encrypt = salsa20_asm_crypt,
+ .decrypt = salsa20_asm_crypt,
};
static int __init init(void)
{
- return crypto_register_alg(&alg);
+ return crypto_register_skcipher(&alg);
}
static void __exit fini(void)
{
- crypto_unregister_alg(&alg);
+ crypto_unregister_skcipher(&alg);
}
module_init(init);
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
index 36870b26067a..d08805032f01 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
@@ -57,10 +57,12 @@ void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
{
unsigned int j;
- state->lens[0] = 0;
- state->lens[1] = 1;
- state->lens[2] = 2;
- state->lens[3] = 3;
+ /* initially all lanes are unused */
+ state->lens[0] = 0xFFFFFFFF00000000;
+ state->lens[1] = 0xFFFFFFFF00000001;
+ state->lens[2] = 0xFFFFFFFF00000002;
+ state->lens[3] = 0xFFFFFFFF00000003;
+
state->unused_lanes = 0xFF03020100;
for (j = 0; j < 4; j++)
state->ldata[j].job_in_lane = NULL;
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index 1c3b7ceb36d2..e7273a606a07 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -55,29 +55,31 @@
#define RAB1bl %bl
#define RAB2bl %cl
+#define CD0 0x0(%rsp)
+#define CD1 0x8(%rsp)
+#define CD2 0x10(%rsp)
+
+# used only before/after all rounds
#define RCD0 %r8
#define RCD1 %r9
#define RCD2 %r10
-#define RCD0d %r8d
-#define RCD1d %r9d
-#define RCD2d %r10d
-
-#define RX0 %rbp
-#define RX1 %r11
-#define RX2 %r12
+# used only during rounds
+#define RX0 %r8
+#define RX1 %r9
+#define RX2 %r10
-#define RX0d %ebp
-#define RX1d %r11d
-#define RX2d %r12d
+#define RX0d %r8d
+#define RX1d %r9d
+#define RX2d %r10d
-#define RY0 %r13
-#define RY1 %r14
-#define RY2 %r15
+#define RY0 %r11
+#define RY1 %r12
+#define RY2 %r13
-#define RY0d %r13d
-#define RY1d %r14d
-#define RY2d %r15d
+#define RY0d %r11d
+#define RY1d %r12d
+#define RY2d %r13d
#define RT0 %rdx
#define RT1 %rsi
@@ -85,6 +87,8 @@
#define RT0d %edx
#define RT1d %esi
+#define RT1bl %sil
+
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
movzbl ab ## bl, tmp2 ## d; \
movzbl ab ## bh, tmp1 ## d; \
@@ -92,6 +96,11 @@
op1##l T0(CTX, tmp2, 4), dst ## d; \
op2##l T1(CTX, tmp1, 4), dst ## d;
+#define swap_ab_with_cd(ab, cd, tmp) \
+ movq cd, tmp; \
+ movq ab, cd; \
+ movq tmp, ab;
+
/*
* Combined G1 & G2 function. Reordered with help of rotates to have moves
* at begining.
@@ -110,15 +119,15 @@
/* G1,2 && G2,2 */ \
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
- xchgq cd ## 0, ab ## 0; \
+ swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
\
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
- xchgq cd ## 1, ab ## 1; \
+ swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
\
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
- xchgq cd ## 2, ab ## 2;
+ swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
#define enc_round_end(ab, x, y, n) \
addl y ## d, x ## d; \
@@ -168,6 +177,16 @@
decrypt_round3(ba, dc, (n*2)+1); \
decrypt_round3(ba, dc, (n*2));
+#define push_cd() \
+ pushq RCD2; \
+ pushq RCD1; \
+ pushq RCD0;
+
+#define pop_cd() \
+ popq RCD0; \
+ popq RCD1; \
+ popq RCD2;
+
#define inpack3(in, n, xy, m) \
movq 4*(n)(in), xy ## 0; \
xorq w+4*m(CTX), xy ## 0; \
@@ -223,11 +242,8 @@ ENTRY(__twofish_enc_blk_3way)
* %rdx: src, RIO
* %rcx: bool, if true: xor output
*/
- pushq %r15;
- pushq %r14;
pushq %r13;
pushq %r12;
- pushq %rbp;
pushq %rbx;
pushq %rcx; /* bool xor */
@@ -235,40 +251,36 @@ ENTRY(__twofish_enc_blk_3way)
inpack_enc3();
- encrypt_cycle3(RAB, RCD, 0);
- encrypt_cycle3(RAB, RCD, 1);
- encrypt_cycle3(RAB, RCD, 2);
- encrypt_cycle3(RAB, RCD, 3);
- encrypt_cycle3(RAB, RCD, 4);
- encrypt_cycle3(RAB, RCD, 5);
- encrypt_cycle3(RAB, RCD, 6);
- encrypt_cycle3(RAB, RCD, 7);
+ push_cd();
+ encrypt_cycle3(RAB, CD, 0);
+ encrypt_cycle3(RAB, CD, 1);
+ encrypt_cycle3(RAB, CD, 2);
+ encrypt_cycle3(RAB, CD, 3);
+ encrypt_cycle3(RAB, CD, 4);
+ encrypt_cycle3(RAB, CD, 5);
+ encrypt_cycle3(RAB, CD, 6);
+ encrypt_cycle3(RAB, CD, 7);
+ pop_cd();
popq RIO; /* dst */
- popq %rbp; /* bool xor */
+ popq RT1; /* bool xor */
- testb %bpl, %bpl;
+ testb RT1bl, RT1bl;
jnz .L__enc_xor3;
outunpack_enc3(mov);
popq %rbx;
- popq %rbp;
popq %r12;
popq %r13;
- popq %r14;
- popq %r15;
ret;
.L__enc_xor3:
outunpack_enc3(xor);
popq %rbx;
- popq %rbp;
popq %r12;
popq %r13;
- popq %r14;
- popq %r15;
ret;
ENDPROC(__twofish_enc_blk_3way)
@@ -278,35 +290,31 @@ ENTRY(twofish_dec_blk_3way)
* %rsi: dst
* %rdx: src, RIO
*/
- pushq %r15;
- pushq %r14;
pushq %r13;
pushq %r12;
- pushq %rbp;
pushq %rbx;
pushq %rsi; /* dst */
inpack_dec3();
- decrypt_cycle3(RAB, RCD, 7);
- decrypt_cycle3(RAB, RCD, 6);
- decrypt_cycle3(RAB, RCD, 5);
- decrypt_cycle3(RAB, RCD, 4);
- decrypt_cycle3(RAB, RCD, 3);
- decrypt_cycle3(RAB, RCD, 2);
- decrypt_cycle3(RAB, RCD, 1);
- decrypt_cycle3(RAB, RCD, 0);
+ push_cd();
+ decrypt_cycle3(RAB, CD, 7);
+ decrypt_cycle3(RAB, CD, 6);
+ decrypt_cycle3(RAB, CD, 5);
+ decrypt_cycle3(RAB, CD, 4);
+ decrypt_cycle3(RAB, CD, 3);
+ decrypt_cycle3(RAB, CD, 2);
+ decrypt_cycle3(RAB, CD, 1);
+ decrypt_cycle3(RAB, CD, 0);
+ pop_cd();
popq RIO; /* dst */
outunpack_dec3();
popq %rbx;
- popq %rbp;
popq %r12;
popq %r13;
- popq %r14;
- popq %r15;
ret;
ENDPROC(twofish_dec_blk_3way)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3f48f695d5e6..dce7092ab24a 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,80 +97,69 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
- .macro ALLOC_PT_GPREGS_ON_STACK
- addq $-(15*8), %rsp
- .endm
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
+ /*
+ * Push registers and sanitize registers of values that a
+ * speculation attack might otherwise want to exploit. The
+ * lower registers are likely clobbered well before they
+ * could be put to use in a speculative execution gadget.
+ * Interleave XOR with PUSH for better uop scheduling:
+ */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq \rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq \rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ xorq %r8, %r8 /* nospec r8 */
+ pushq %r9 /* pt_regs->r9 */
+ xorq %r9, %r9 /* nospec r9 */
+ pushq %r10 /* pt_regs->r10 */
+ xorq %r10, %r10 /* nospec r10 */
+ pushq %r11 /* pt_regs->r11 */
+ xorq %r11, %r11 /* nospec r11*/
+ pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx*/
+ pushq %rbp /* pt_regs->rbp */
+ xorl %ebp, %ebp /* nospec rbp*/
+ pushq %r12 /* pt_regs->r12 */
+ xorq %r12, %r12 /* nospec r12*/
+ pushq %r13 /* pt_regs->r13 */
+ xorq %r13, %r13 /* nospec r13*/
+ pushq %r14 /* pt_regs->r14 */
+ xorq %r14, %r14 /* nospec r14*/
+ pushq %r15 /* pt_regs->r15 */
+ xorq %r15, %r15 /* nospec r15*/
+ UNWIND_HINT_REGS
+.endm
- .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
- .if \r11
- movq %r11, 6*8+\offset(%rsp)
- .endif
- .if \r8910
- movq %r10, 7*8+\offset(%rsp)
- movq %r9, 8*8+\offset(%rsp)
- movq %r8, 9*8+\offset(%rsp)
- .endif
- .if \rax
- movq %rax, 10*8+\offset(%rsp)
- .endif
- .if \rcx
- movq %rcx, 11*8+\offset(%rsp)
- .endif
- movq %rdx, 12*8+\offset(%rsp)
- movq %rsi, 13*8+\offset(%rsp)
- movq %rdi, 14*8+\offset(%rsp)
- UNWIND_HINT_REGS offset=\offset extra=0
- .endm
- .macro SAVE_C_REGS offset=0
- SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
- .endm
- .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
- SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
- .endm
- .macro SAVE_C_REGS_EXCEPT_R891011
- SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
- .endm
- .macro SAVE_C_REGS_EXCEPT_RCX_R891011
- SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
- .endm
- .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
- SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
- .endm
-
- .macro SAVE_EXTRA_REGS offset=0
- movq %r15, 0*8+\offset(%rsp)
- movq %r14, 1*8+\offset(%rsp)
- movq %r13, 2*8+\offset(%rsp)
- movq %r12, 3*8+\offset(%rsp)
- movq %rbp, 4*8+\offset(%rsp)
- movq %rbx, 5*8+\offset(%rsp)
- UNWIND_HINT_REGS offset=\offset
- .endm
-
- .macro POP_EXTRA_REGS
+.macro POP_REGS pop_rdi=1 skip_r11rcx=0
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
- .endm
-
- .macro POP_C_REGS
+ .if \skip_r11rcx
+ popq %rsi
+ .else
popq %r11
+ .endif
popq %r10
popq %r9
popq %r8
popq %rax
+ .if \skip_r11rcx
+ popq %rsi
+ .else
popq %rcx
+ .endif
popq %rdx
popq %rsi
+ .if \pop_rdi
popq %rdi
- .endm
-
- .macro icebp
- .byte 0xf1
- .endm
+ .endif
+.endm
/*
* This is a sneaky trick to help the unwinder find pt_regs on the stack. The
@@ -178,7 +167,7 @@ For 32-bit we have the following conventions - kernel is built with
* is just setting the LSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
- * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
* the original rbp.
*/
.macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d7d3cc24baf4..74f6eee15179 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
+#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
@@ -153,6 +154,9 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
if (cached_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
+ if (cached_flags & _TIF_PATCH_PENDING)
+ klp_update_patch_state(current);
+
/* deal with pending signal delivery */
if (cached_flags & _TIF_SIGPENDING)
do_signal(regs);
@@ -165,9 +169,6 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
if (cached_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
- if (cached_flags & _TIF_PATCH_PENDING)
- klp_update_patch_state(current);
-
/* Disable IRQs and retry */
local_irq_disable();
@@ -206,7 +207,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
* special case only applies after poking regs and before the
* very next return to user mode.
*/
- current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
+ ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
user_enter_irqoff();
@@ -282,7 +283,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
* regs->orig_ax, which changes the behavior of some syscalls.
*/
if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
- regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+ nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
+ regs->ax = sys_call_table[nr](
regs->di, regs->si, regs->dx,
regs->r10, regs->r8, regs->r9);
}
@@ -304,7 +306,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
unsigned int nr = (unsigned int)regs->orig_ax;
#ifdef CONFIG_IA32_EMULATION
- current->thread.status |= TS_COMPAT;
+ ti->status |= TS_COMPAT;
#endif
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
@@ -318,6 +320,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
}
if (likely(nr < IA32_NR_syscalls)) {
+ nr = array_index_nospec(nr, IA32_NR_syscalls);
/*
* It's possible that a 32-bit syscall implementation
* takes a 64-bit parameter but nonetheless assumes that
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 2a35b1e0fb90..16c2c022540d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -566,6 +566,11 @@ restore_all:
.Lrestore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
.Lirq_return:
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+ * when returning from IPI handler and when returning from
+ * scheduler to user-space.
+ */
INTERRUPT_RETURN
.section .fixup, "ax"
@@ -895,6 +900,9 @@ BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
hyperv_vector_handler)
+BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
+ hyperv_reenlightenment_intr)
+
#endif /* CONFIG_HYPERV */
ENTRY(page_fault)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a83570495162..8971bd64d515 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -213,7 +213,7 @@ ENTRY(entry_SYSCALL_64)
swapgs
/*
- * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+ * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
* is not required to switch CR3.
*/
movq %rsp, PER_CPU_VAR(rsp_scratch)
@@ -227,100 +227,15 @@ ENTRY(entry_SYSCALL_64)
pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq %r8 /* pt_regs->r8 */
- pushq %r9 /* pt_regs->r9 */
- pushq %r10 /* pt_regs->r10 */
- pushq %r11 /* pt_regs->r11 */
- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
- UNWIND_HINT_REGS extra=0
- TRACE_IRQS_OFF
-
- /*
- * If we need to do entry work or if we guess we'll need to do
- * exit work, go straight to the slow path.
- */
- movq PER_CPU_VAR(current_task), %r11
- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
- jnz entry_SYSCALL64_slow_path
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
-entry_SYSCALL_64_fastpath:
- /*
- * Easy case: enable interrupts and issue the syscall. If the syscall
- * needs pt_regs, we'll call a stub that disables interrupts again
- * and jumps to the slow path.
- */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max, %rax
-#else
- andl $__SYSCALL_MASK, %eax
- cmpl $__NR_syscall_max, %eax
-#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10, %rcx
-
- /*
- * This call instruction is handled specially in stub_ptregs_64.
- * It might end up jumping to the slow path. If it jumps, RAX
- * and all argument registers are clobbered.
- */
-#ifdef CONFIG_RETPOLINE
- movq sys_call_table(, %rax, 8), %rax
- call __x86_indirect_thunk_rax
-#else
- call *sys_call_table(, %rax, 8)
-#endif
-.Lentry_SYSCALL_64_after_fastpath_call:
-
- movq %rax, RAX(%rsp)
-1:
-
- /*
- * If we get here, then we know that pt_regs is clean for SYSRET64.
- * If we see that no exit work is required (which we are required
- * to check with IRQs off), then we can go straight to SYSRET64.
- */
- DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- movq PER_CPU_VAR(current_task), %r11
- testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
- jnz 1f
-
- LOCKDEP_SYS_EXIT
- TRACE_IRQS_ON /* user mode is traced as IRQs on */
- movq RIP(%rsp), %rcx
- movq EFLAGS(%rsp), %r11
- addq $6*8, %rsp /* skip extra regs -- they were preserved */
- UNWIND_HINT_EMPTY
- jmp .Lpop_c_regs_except_rcx_r11_and_sysret
-
-1:
- /*
- * The fast path looked good when we started, but something changed
- * along the way and we need to switch to the slow path. Calling
- * raise(3) will trigger this, for example. IRQs are off.
- */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- SAVE_EXTRA_REGS
- movq %rsp, %rdi
- call syscall_return_slowpath /* returns with IRQs disabled */
- jmp return_from_SYSCALL_64
-entry_SYSCALL64_slow_path:
/* IRQs are off. */
- SAVE_EXTRA_REGS
movq %rsp, %rdi
call do_syscall_64 /* returns with IRQs disabled */
-return_from_SYSCALL_64:
TRACE_IRQS_IRETQ /* we're about to change IF */
/*
@@ -392,16 +307,7 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
UNWIND_HINT_EMPTY
- POP_EXTRA_REGS
-.Lpop_c_regs_except_rcx_r11_and_sysret:
- popq %rsi /* skip r11 */
- popq %r10
- popq %r9
- popq %r8
- popq %rax
- popq %rsi /* skip rcx */
- popq %rdx
- popq %rsi
+ POP_REGS pop_rdi=0 skip_r11rcx=1
/*
* Now all regs are restored except RSP and RDI.
@@ -424,47 +330,6 @@ syscall_return_via_sysret:
USERGS_SYSRET64
END(entry_SYSCALL_64)
-ENTRY(stub_ptregs_64)
- /*
- * Syscalls marked as needing ptregs land here.
- * If we are on the fast path, we need to save the extra regs,
- * which we achieve by trying again on the slow path. If we are on
- * the slow path, the extra regs are already saved.
- *
- * RAX stores a pointer to the C function implementing the syscall.
- * IRQs are on.
- */
- cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
- jne 1f
-
- /*
- * Called from fast path -- disable IRQs again, pop return address
- * and jump to slow path
- */
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- popq %rax
- UNWIND_HINT_REGS extra=0
- jmp entry_SYSCALL64_slow_path
-
-1:
- JMP_NOSPEC %rax /* Called from C */
-END(stub_ptregs_64)
-
-.macro ptregs_stub func
-ENTRY(ptregs_\func)
- UNWIND_HINT_FUNC
- leaq \func(%rip), %rax
- jmp stub_ptregs_64
-END(ptregs_\func)
-.endm
-
-/* Instantiate ptregs_stub for each ptregs-using syscall */
-#define __SYSCALL_64_QUAL_(sym)
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
-#include <asm/syscalls_64.h>
-
/*
* %rdi: prev task
* %rsi: next task
@@ -672,9 +537,7 @@ END(irq_entries_start)
call switch_to_thread_stack
1:
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
+ PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
testb $3, CS(%rsp)
@@ -735,15 +598,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
ud2
1:
#endif
- POP_EXTRA_REGS
- popq %r11
- popq %r10
- popq %r9
- popq %r8
- popq %rax
- popq %rcx
- popq %rdx
- popq %rsi
+ POP_REGS pop_rdi=0
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
@@ -801,9 +656,12 @@ GLOBAL(restore_regs_and_return_to_kernel)
ud2
1:
#endif
- POP_EXTRA_REGS
- POP_C_REGS
+ POP_REGS
addq $8, %rsp /* skip regs->orig_ax */
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+ * when returning from IPI handler.
+ */
INTERRUPT_RETURN
ENTRY(native_iret)
@@ -1017,7 +875,9 @@ ENTRY(\sym)
pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
- ALLOC_PT_GPREGS_ON_STACK
+ /* Save all registers in pt_regs */
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
.if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
@@ -1230,9 +1090,7 @@ ENTRY(xen_failsafe_callback)
addq $0x30, %rsp
UNWIND_HINT_IRET_REGS
pushq $-1 /* orig_ax = -1 => not a system call */
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
+ PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
@@ -1245,6 +1103,9 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
hyperv_callback_vector hyperv_vector_handler
+
+apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
+ hyperv_reenlightenment_vector hyperv_reenlightenment_intr
#endif /* CONFIG_HYPERV */
idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
@@ -1269,16 +1130,13 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
#endif
/*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
- ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
@@ -1317,21 +1175,18 @@ ENTRY(paranoid_exit)
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
+ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
.Lparanoid_exit_restore:
jmp restore_regs_and_return_to_kernel
END(paranoid_exit)
/*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
* Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
- UNWIND_HINT_FUNC
+ UNWIND_HINT_REGS offset=8
cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
- ENCODE_FRAME_POINTER 8
- xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1512,22 +1367,7 @@ ENTRY(nmi)
pushq 1*8(%rdx) /* pt_regs->rip */
UNWIND_HINT_IRET_REGS
pushq $-1 /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- pushq (%rdx) /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
- pushq %rax /* pt_regs->ax */
- pushq %r8 /* pt_regs->r8 */
- pushq %r9 /* pt_regs->r9 */
- pushq %r10 /* pt_regs->r10 */
- pushq %r11 /* pt_regs->r11 */
- pushq %rbx /* pt_regs->rbx */
- pushq %rbp /* pt_regs->rbp */
- pushq %r12 /* pt_regs->r12 */
- pushq %r13 /* pt_regs->r13 */
- pushq %r14 /* pt_regs->r14 */
- pushq %r15 /* pt_regs->r15 */
- UNWIND_HINT_REGS
+ PUSH_AND_CLEAR_REGS rdx=(%rdx)
ENCODE_FRAME_POINTER
/*
@@ -1737,7 +1577,8 @@ end_repeat_nmi:
* frame to point back to repeat_nmi.
*/
pushq $-1 /* ORIG_RAX: no syscall to restart */
- ALLOC_PT_GPREGS_ON_STACK
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
@@ -1761,8 +1602,7 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
- POP_EXTRA_REGS
- POP_C_REGS
+ POP_REGS
/*
* Skip orig_ax and the "outermost" frame to point RSP at the "iret"
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 98d5358e4041..fd65e016e413 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -85,15 +85,25 @@ ENTRY(entry_SYSENTER_compat)
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq $0 /* pt_regs->r8 = 0 */
+ xorq %r8, %r8 /* nospec r8 */
pushq $0 /* pt_regs->r9 = 0 */
+ xorq %r9, %r9 /* nospec r9 */
pushq $0 /* pt_regs->r10 = 0 */
+ xorq %r10, %r10 /* nospec r10 */
pushq $0 /* pt_regs->r11 = 0 */
+ xorq %r11, %r11 /* nospec r11 */
pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx */
pushq %rbp /* pt_regs->rbp (will be overwritten) */
+ xorl %ebp, %ebp /* nospec rbp */
pushq $0 /* pt_regs->r12 = 0 */
+ xorq %r12, %r12 /* nospec r12 */
pushq $0 /* pt_regs->r13 = 0 */
+ xorq %r13, %r13 /* nospec r13 */
pushq $0 /* pt_regs->r14 = 0 */
+ xorq %r14, %r14 /* nospec r14 */
pushq $0 /* pt_regs->r15 = 0 */
+ xorq %r15, %r15 /* nospec r15 */
cld
/*
@@ -214,15 +224,25 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
pushq %rbp /* pt_regs->cx (stashed in bp) */
pushq $-ENOSYS /* pt_regs->ax */
pushq $0 /* pt_regs->r8 = 0 */
+ xorq %r8, %r8 /* nospec r8 */
pushq $0 /* pt_regs->r9 = 0 */
+ xorq %r9, %r9 /* nospec r9 */
pushq $0 /* pt_regs->r10 = 0 */
+ xorq %r10, %r10 /* nospec r10 */
pushq $0 /* pt_regs->r11 = 0 */
+ xorq %r11, %r11 /* nospec r11 */
pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx */
pushq %rbp /* pt_regs->rbp (will be overwritten) */
+ xorl %ebp, %ebp /* nospec rbp */
pushq $0 /* pt_regs->r12 = 0 */
+ xorq %r12, %r12 /* nospec r12 */
pushq $0 /* pt_regs->r13 = 0 */
+ xorq %r13, %r13 /* nospec r13 */
pushq $0 /* pt_regs->r14 = 0 */
+ xorq %r14, %r14 /* nospec r14 */
pushq $0 /* pt_regs->r15 = 0 */
+ xorq %r15, %r15 /* nospec r15 */
/*
* User mode is traced as though IRQs are on, and SYSENTER
@@ -338,15 +358,25 @@ ENTRY(entry_INT80_compat)
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq $0 /* pt_regs->r8 = 0 */
+ xorq %r8, %r8 /* nospec r8 */
pushq $0 /* pt_regs->r9 = 0 */
+ xorq %r9, %r9 /* nospec r9 */
pushq $0 /* pt_regs->r10 = 0 */
+ xorq %r10, %r10 /* nospec r10 */
pushq $0 /* pt_regs->r11 = 0 */
+ xorq %r11, %r11 /* nospec r11 */
pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx */
pushq %rbp /* pt_regs->rbp */
+ xorl %ebp, %ebp /* nospec rbp */
pushq %r12 /* pt_regs->r12 */
+ xorq %r12, %r12 /* nospec r12 */
pushq %r13 /* pt_regs->r13 */
+ xorq %r13, %r13 /* nospec r13 */
pushq %r14 /* pt_regs->r14 */
+ xorq %r14, %r14 /* nospec r14 */
pushq %r15 /* pt_regs->r15 */
+ xorq %r15, %r15 /* nospec r15 */
cld
/*
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 9c09775e589d..c176d2fab1da 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -7,14 +7,11 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#define __SYSCALL_64_QUAL_(sym) sym
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
-
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
+#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 731153a4681e..56457cb73448 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3559,7 +3559,7 @@ static int intel_snb_pebs_broken(int cpu)
break;
case INTEL_FAM6_SANDYBRIDGE_X:
- switch (cpu_data(cpu).x86_mask) {
+ switch (cpu_data(cpu).x86_stepping) {
case 6: rev = 0x618; break;
case 7: rev = 0x70c; break;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index ae64d0b69729..cf372b90557e 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1186,7 +1186,7 @@ void __init intel_pmu_lbr_init_atom(void)
* on PMU interrupt
*/
if (boot_cpu_data.x86_model == 28
- && boot_cpu_data.x86_mask < 10) {
+ && boot_cpu_data.x86_stepping < 10) {
pr_cont("LBR disabled due to erratum");
return;
}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
index a5604c352930..408879b0c0d4 100644
--- a/arch/x86/events/intel/p6.c
+++ b/arch/x86/events/intel/p6.c
@@ -234,7 +234,7 @@ static __initconst const struct x86_pmu p6_pmu = {
static __init void p6_pmu_rdpmc_quirk(void)
{
- if (boot_cpu_data.x86_mask < 9) {
+ if (boot_cpu_data.x86_stepping < 9) {
/*
* PPro erratum 26; fixed in stepping 9 and above.
*/
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 189a398290db..2edc49e7409b 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -18,6 +18,8 @@
*/
#include <linux/types.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
@@ -37,6 +39,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
return tsc_pg;
}
+EXPORT_SYMBOL_GPL(hv_get_tsc_page);
static u64 read_hv_clock_tsc(struct clocksource *arg)
{
@@ -101,6 +104,115 @@ static int hv_cpu_init(unsigned int cpu)
return 0;
}
+static void (*hv_reenlightenment_cb)(void);
+
+static void hv_reenlightenment_notify(struct work_struct *dummy)
+{
+ struct hv_tsc_emulation_status emu_status;
+
+ rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+ /* Don't issue the callback if TSC accesses are not emulated */
+ if (hv_reenlightenment_cb && emu_status.inprogress)
+ hv_reenlightenment_cb();
+}
+static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
+
+void hyperv_stop_tsc_emulation(void)
+{
+ u64 freq;
+ struct hv_tsc_emulation_status emu_status;
+
+ rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+ emu_status.inprogress = 0;
+ wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+ tsc_khz = div64_u64(freq, 1000);
+}
+EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
+
+static inline bool hv_reenlightenment_available(void)
+{
+ /*
+ * Check for required features and priviliges to make TSC frequency
+ * change notifications work.
+ */
+ return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE &&
+ ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT;
+}
+
+__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs)
+{
+ entering_ack_irq();
+
+ inc_irq_stat(irq_hv_reenlightenment_count);
+
+ schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
+
+ exiting_irq();
+}
+
+void set_hv_tscchange_cb(void (*cb)(void))
+{
+ struct hv_reenlightenment_control re_ctrl = {
+ .vector = HYPERV_REENLIGHTENMENT_VECTOR,
+ .enabled = 1,
+ .target_vp = hv_vp_index[smp_processor_id()]
+ };
+ struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
+
+ if (!hv_reenlightenment_available()) {
+ pr_warn("Hyper-V: reenlightenment support is unavailable\n");
+ return;
+ }
+
+ hv_reenlightenment_cb = cb;
+
+ /* Make sure callback is registered before we write to MSRs */
+ wmb();
+
+ wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
+}
+EXPORT_SYMBOL_GPL(set_hv_tscchange_cb);
+
+void clear_hv_tscchange_cb(void)
+{
+ struct hv_reenlightenment_control re_ctrl;
+
+ if (!hv_reenlightenment_available())
+ return;
+
+ rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+ re_ctrl.enabled = 0;
+ wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+
+ hv_reenlightenment_cb = NULL;
+}
+EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb);
+
+static int hv_cpu_die(unsigned int cpu)
+{
+ struct hv_reenlightenment_control re_ctrl;
+ unsigned int new_cpu;
+
+ if (hv_reenlightenment_cb == NULL)
+ return 0;
+
+ rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ if (re_ctrl.target_vp == hv_vp_index[cpu]) {
+ /* Reassign to some other online CPU */
+ new_cpu = cpumask_any_but(cpu_online_mask, cpu);
+
+ re_ctrl.target_vp = hv_vp_index[new_cpu];
+ wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ }
+
+ return 0;
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -110,12 +222,19 @@ static int hv_cpu_init(unsigned int cpu)
*/
void hyperv_init(void)
{
- u64 guest_id;
+ u64 guest_id, required_msrs;
union hv_x64_msr_hypercall_contents hypercall_msr;
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
+ /* Absolutely required MSRs */
+ required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE |
+ HV_X64_MSR_VP_INDEX_AVAILABLE;
+
+ if ((ms_hyperv.features & required_msrs) != required_msrs)
+ return;
+
/* Allocate percpu VP index */
hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
GFP_KERNEL);
@@ -123,7 +242,7 @@ void hyperv_init(void)
return;
if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
- hv_cpu_init, NULL) < 0)
+ hv_cpu_init, hv_cpu_die) < 0)
goto free_vp_index;
/*
@@ -239,17 +358,24 @@ void hyperv_report_panic(struct pt_regs *regs, long err)
}
EXPORT_SYMBOL_GPL(hyperv_report_panic);
-bool hv_is_hypercall_page_setup(void)
+bool hv_is_hyperv_initialized(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
- /* Check if the hypercall page is setup */
+ /*
+ * Ensure that we're really on Hyper-V, and not a KVM or Xen
+ * emulation of Hyper-V
+ */
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
+ return false;
+
+ /*
+ * Verify that earlier initialization succeeded by checking
+ * that the hypercall page is setup
+ */
hypercall_msr.as_uint64 = 0;
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
- if (!hypercall_msr.enable)
- return false;
-
- return true;
+ return hypercall_msr.enable;
}
-EXPORT_SYMBOL_GPL(hv_is_hypercall_page_setup);
+EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 5d6a53fd7521..de690c2d2e33 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -6,7 +6,6 @@ generated-y += unistd_32_ia32.h
generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
-generic-y += clkdev.h
generic-y += dma-contiguous.h
generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 44f5d79d5105..11881726ed37 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -94,7 +94,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
if (boot_cpu_data.x86 == 0x0F &&
boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86_model <= 0x05 &&
- boot_cpu_data.x86_mask < 0x0A)
+ boot_cpu_data.x86_stepping < 0x0A)
return 1;
else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E))
return 1;
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 7fb336210e1b..e1259f043ae9 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -24,6 +24,34 @@
#define wmb() asm volatile("sfence" ::: "memory")
#endif
+/**
+ * array_index_mask_nospec() - generate a mask that is ~0UL when the
+ * bounds check succeeds and 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ *
+ * Returns:
+ * 0 - (index < size)
+ */
+static inline unsigned long array_index_mask_nospec(unsigned long index,
+ unsigned long size)
+{
+ unsigned long mask;
+
+ asm ("cmp %1,%2; sbb %0,%0;"
+ :"=r" (mask)
+ :"g"(size),"r" (index)
+ :"cc");
+ return mask;
+}
+
+/* Override the default implementation from linux/nospec.h. */
+#define array_index_mask_nospec array_index_mask_nospec
+
+/* Prevent speculative execution past this barrier. */
+#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
+ "lfence", X86_FEATURE_LFENCE_RDTSC)
+
#ifdef CONFIG_X86_PPRO_FENCE
#define dma_rmb() rmb()
#else
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 34d99af43994..6804d6642767 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -5,23 +5,20 @@
#include <linux/stringify.h>
/*
- * Since some emulators terminate on UD2, we cannot use it for WARN.
- * Since various instruction decoders disagree on the length of UD1,
- * we cannot use it either. So use UD0 for WARN.
+ * Despite that some emulators terminate on UD2, we use it for WARN().
*
- * (binutils knows about "ud1" but {en,de}codes it as 2 bytes, whereas
- * our kernel decoder thinks it takes a ModRM byte, which seems consistent
- * with various things like the Intel SDM instruction encoding rules)
+ * Since various instruction decoders/specs disagree on the encoding of
+ * UD0/UD1.
*/
-#define ASM_UD0 ".byte 0x0f, 0xff"
+#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
#define ASM_UD2 ".byte 0x0f, 0x0b"
#define INSN_UD0 0xff0f
#define INSN_UD2 0x0b0f
-#define LEN_UD0 2
+#define LEN_UD2 2
#ifdef CONFIG_GENERIC_BUG
@@ -77,7 +74,11 @@ do { \
unreachable(); \
} while (0)
-#define __WARN_FLAGS(flags) _BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags))
+#define __WARN_FLAGS(flags) \
+do { \
+ _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \
+ annotate_reachable(); \
+} while (0)
#include <asm-generic/bug.h>
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 2cbd75dd2fd3..e1c8dab86670 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -127,88 +127,6 @@ typedef u32 compat_old_sigset_t; /* at least 32 bits */
typedef u32 compat_sigset_word;
-typedef union compat_sigval {
- compat_int_t sival_int;
- compat_uptr_t sival_ptr;
-} compat_sigval_t;
-
-typedef struct compat_siginfo {
- int si_signo;
- int si_errno;
- int si_code;
-
- union {
- int _pad[128/sizeof(int) - 3];
-
- /* kill() */
- struct {
- unsigned int _pid; /* sender's pid */
- unsigned int _uid; /* sender's uid */
- } _kill;
-
- /* POSIX.1b timers */
- struct {
- compat_timer_t _tid; /* timer id */
- int _overrun; /* overrun count */
- compat_sigval_t _sigval; /* same as below */
- int _sys_private; /* not to be passed to user */
- int _overrun_incr; /* amount to add to overrun */
- } _timer;
-
- /* POSIX.1b signals */
- struct {
- unsigned int _pid; /* sender's pid */
- unsigned int _uid; /* sender's uid */
- compat_sigval_t _sigval;
- } _rt;
-
- /* SIGCHLD */
- struct {
- unsigned int _pid; /* which child */
- unsigned int _uid; /* sender's uid */
- int _status; /* exit code */
- compat_clock_t _utime;
- compat_clock_t _stime;
- } _sigchld;
-
- /* SIGCHLD (x32 version) */
- struct {
- unsigned int _pid; /* which child */
- unsigned int _uid; /* sender's uid */
- int _status; /* exit code */
- compat_s64 _utime;
- compat_s64 _stime;
- } _sigchld_x32;
-
- /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
- struct {
- unsigned int _addr; /* faulting insn/memory ref. */
- short int _addr_lsb; /* Valid LSB of the reported address. */
- union {
- /* used when si_code=SEGV_BNDERR */
- struct {
- compat_uptr_t _lower;
- compat_uptr_t _upper;
- } _addr_bnd;
- /* used when si_code=SEGV_PKUERR */
- compat_u32 _pkey;
- };
- } _sigfault;
-
- /* SIGPOLL */
- struct {
- int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
- int _fd;
- } _sigpoll;
-
- struct {
- unsigned int _call_addr; /* calling insn */
- int _syscall; /* triggering system call number */
- unsigned int _arch; /* AUDIT_ARCH_* of syscall */
- } _sigsys;
- } _sifields;
-} compat_siginfo_t;
-
#define COMPAT_OFF_T_MAX 0x7fffffff
struct compat_ipc64_perm {
@@ -331,4 +249,8 @@ static inline bool in_compat_syscall(void)
}
#define in_compat_syscall in_compat_syscall /* override the generic impl */
+struct compat_siginfo;
+int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const siginfo_t *from, bool x32_ABI);
+
#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 70eddb3922ff..736771c9822e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -148,45 +148,46 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline __pure bool _static_cpu_has(u16 bit)
{
- asm_volatile_goto("1: jmp 6f\n"
- "2:\n"
- ".skip -(((5f-4f) - (2b-1b)) > 0) * "
- "((5f-4f) - (2b-1b)),0x90\n"
- "3:\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 4f - .\n" /* repl offset */
- " .word %P1\n" /* always replace */
- " .byte 3b - 1b\n" /* src len */
- " .byte 5f - 4f\n" /* repl len */
- " .byte 3b - 2b\n" /* pad len */
- ".previous\n"
- ".section .altinstr_replacement,\"ax\"\n"
- "4: jmp %l[t_no]\n"
- "5:\n"
- ".previous\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 0\n" /* no replacement */
- " .word %P0\n" /* feature bit */
- " .byte 3b - 1b\n" /* src len */
- " .byte 0\n" /* repl len */
- " .byte 0\n" /* pad len */
- ".previous\n"
- ".section .altinstr_aux,\"ax\"\n"
- "6:\n"
- " testb %[bitnum],%[cap_byte]\n"
- " jnz %l[t_yes]\n"
- " jmp %l[t_no]\n"
- ".previous\n"
- : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
- [bitnum] "i" (1 << (bit & 7)),
- [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
- : : t_yes, t_no);
- t_yes:
- return true;
- t_no:
- return false;
+ asm_volatile_goto("1: jmp 6f\n"
+ "2:\n"
+ ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+ "((5f-4f) - (2b-1b)),0x90\n"
+ "3:\n"
+ ".section .altinstructions,\"a\"\n"
+ " .long 1b - .\n" /* src offset */
+ " .long 4f - .\n" /* repl offset */
+ " .word %P[always]\n" /* always replace */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 5f - 4f\n" /* repl len */
+ " .byte 3b - 2b\n" /* pad len */
+ ".previous\n"
+ ".section .altinstr_replacement,\"ax\"\n"
+ "4: jmp %l[t_no]\n"
+ "5:\n"
+ ".previous\n"
+ ".section .altinstructions,\"a\"\n"
+ " .long 1b - .\n" /* src offset */
+ " .long 0\n" /* no replacement */
+ " .word %P[feature]\n" /* feature bit */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 0\n" /* repl len */
+ " .byte 0\n" /* pad len */
+ ".previous\n"
+ ".section .altinstr_aux,\"ax\"\n"
+ "6:\n"
+ " testb %[bitnum],%[cap_byte]\n"
+ " jnz %l[t_yes]\n"
+ " jmp %l[t_no]\n"
+ ".previous\n"
+ : : [feature] "i" (bit),
+ [always] "i" (X86_FEATURE_ALWAYS),
+ [bitnum] "i" (1 << (bit & 7)),
+ [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+ : : t_yes, t_no);
+t_yes:
+ return true;
+t_no:
+ return false;
}
#define static_cpu_has(bit) \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1d9199e1c2ad..0dfe4d3f74e2 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -210,6 +210,7 @@
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
+#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
diff --git a/arch/x86/include/asm/dma-direct.h b/arch/x86/include/asm/dma-direct.h
new file mode 100644
index 000000000000..1295bc622ebe
--- /dev/null
+++ b/arch/x86/include/asm/dma-direct.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_DMA_DIRECT_H
+#define ASM_X86_DMA_DIRECT_H 1
+
+#include <linux/mem_encrypt.h>
+
+#ifdef CONFIG_X86_DMA_REMAP /* Platform code defines bridge-specific code */
+bool dma_capable(struct device *dev, dma_addr_t addr, size_t size);
+dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr);
+phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr);
+#else
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+ if (!dev->dma_mask)
+ return 0;
+
+ return addr + size - 1 <= *dev->dma_mask;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+ return __sme_set(paddr);
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+ return __sme_clr(daddr);
+}
+#endif /* CONFIG_X86_DMA_REMAP */
+#endif /* ASM_X86_DMA_DIRECT_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 0350d99bb8fd..6277c83c0eb1 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,7 +12,6 @@
#include <asm/io.h>
#include <asm/swiotlb.h>
#include <linux/dma-contiguous.h>
-#include <linux/mem_encrypt.h>
#ifdef CONFIG_ISA
# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -31,6 +30,9 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
return dma_ops;
}
+int arch_dma_supported(struct device *dev, u64 mask);
+#define arch_dma_supported arch_dma_supported
+
bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
#define arch_dma_alloc_attrs arch_dma_alloc_attrs
@@ -42,31 +44,6 @@ extern void dma_generic_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_addr,
unsigned long attrs);
-#ifdef CONFIG_X86_DMA_REMAP /* Platform code defines bridge-specific code */
-extern bool dma_capable(struct device *dev, dma_addr_t addr, size_t size);
-extern dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr);
-extern phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr);
-#else
-
-static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
-{
- if (!dev->dma_mask)
- return 0;
-
- return addr + size - 1 <= *dev->dma_mask;
-}
-
-static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
-{
- return __sme_set(paddr);
-}
-
-static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
-{
- return __sme_clr(daddr);
-}
-#endif /* CONFIG_X86_DMA_REMAP */
-
static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
gfp_t gfp)
{
diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h
new file mode 100644
index 000000000000..47b7a1296245
--- /dev/null
+++ b/arch/x86/include/asm/error-injection.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ERROR_INJECTION_H
+#define _ASM_ERROR_INJECTION_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm-generic/error-injection.h>
+
+asmlinkage void just_return_func(void);
+void override_function_with_return(struct pt_regs *regs);
+
+#endif /* _ASM_ERROR_INJECTION_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 64c4a30e0d39..e203169931c7 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -137,8 +137,10 @@ enum fixed_addresses {
extern void reserve_top_address(unsigned long reserve);
-#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE)
extern int fixmaps_set;
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 4df2754ef380..44bbc39a57b3 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -20,12 +20,6 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
# define ia32_setup_rt_frame __setup_rt_frame
#endif
-#ifdef CONFIG_COMPAT
-int __copy_siginfo_to_user32(compat_siginfo_t __user *to,
- const siginfo_t *from, bool x32_ABI);
-#endif
-
-
extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
struct task_struct *tsk);
extern void convert_to_fxsr(struct task_struct *tsk,
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 51cc979dd364..7c341a74ec8c 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -38,6 +38,9 @@ typedef struct {
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
unsigned int irq_hv_callback_count;
#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ unsigned int irq_hv_reenlightenment_count;
+#endif
} ____cacheline_aligned irq_cpustat_t;
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 35a6bc4da8ad..cf090e584202 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -10,6 +10,10 @@
*
* Things ending in "2" are usually because we have no better
* name for them. There's no processor called "SILVERMONT2".
+ *
+ * While adding a new CPUID for a new microarchitecture, add a new
+ * group to keep logically sorted out in chronological order. Within
+ * that group keep the CPUID for the variants sorted by model number.
*/
#define INTEL_FAM6_CORE_YONAH 0x0E
@@ -49,6 +53,8 @@
#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E
#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E
+#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66
+
/* "Small Core" Processors (Atom) */
#define INTEL_FAM6_ATOM_PINEVIEW 0x1C
diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h
index 528ed4be4393..9e7adcdbe031 100644
--- a/arch/x86/include/asm/intel_pmc_ipc.h
+++ b/arch/x86/include/asm/intel_pmc_ipc.h
@@ -38,6 +38,7 @@ int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
u32 *out, u32 outlen);
int intel_pmc_s0ix_counter_read(u64 *data);
int intel_pmc_gcr_read(u32 offset, u32 *data);
+int intel_pmc_gcr_read64(u32 offset, u64 *data);
int intel_pmc_gcr_write(u32 offset, u32 data);
int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val);
@@ -70,6 +71,11 @@ static inline int intel_pmc_gcr_read(u32 offset, u32 *data)
return -EINVAL;
}
+static inline int intel_pmc_gcr_read64(u32 offset, u64 *data)
+{
+ return -EINVAL;
+}
+
static inline int intel_pmc_gcr_write(u32 offset, u32 data)
{
return -EINVAL;
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index 7d87437bd030..3de0489deade 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -147,6 +147,18 @@ int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb);
int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb);
/**
+ * iosf_mbi_unregister_pmic_bus_access_notifier_unlocked - Unregister PMIC bus
+ * notifier, unlocked
+ *
+ * Like iosf_mbi_unregister_pmic_bus_access_notifier(), but for use when the
+ * caller has already called iosf_mbi_punit_acquire() itself.
+ *
+ * @nb: notifier_block to unregister
+ */
+int iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
+ struct notifier_block *nb);
+
+/**
* iosf_mbi_call_pmic_bus_access_notifier_chain - Call PMIC bus notifier chain
*
* @val: action to pass into listener's notifier_call function
@@ -154,6 +166,11 @@ int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb);
*/
int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v);
+/**
+ * iosf_mbi_assert_punit_acquired - Assert that the P-Unit has been acquired.
+ */
+void iosf_mbi_assert_punit_acquired(void);
+
#else /* CONFIG_IOSF_MBI is not enabled */
static inline
bool iosf_mbi_available(void)
@@ -197,12 +214,20 @@ int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb)
return 0;
}
+static inline int
+iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(struct notifier_block *nb)
+{
+ return 0;
+}
+
static inline
int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v)
{
return 0;
}
+static inline void iosf_mbi_assert_punit_acquired(void) {}
+
#endif /* CONFIG_IOSF_MBI */
#endif /* IOSF_MBI_SYMS_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 67421f649cfa..e71c1120426b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -103,7 +103,12 @@
#endif
#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef
-#define LOCAL_TIMER_VECTOR 0xee
+
+#if IS_ENABLED(CONFIG_HYPERV)
+#define HYPERV_REENLIGHTENMENT_VECTOR 0xee
+#endif
+
+#define LOCAL_TIMER_VECTOR 0xed
#define NR_VECTORS 256
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index b577dd0916aa..13e70da38bed 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -4,6 +4,7 @@
#include <linux/const.h>
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+#define KASAN_SHADOW_SCALE_SHIFT 3
/*
* Compiler uses shadow offset assuming that addresses start
@@ -12,12 +13,15 @@
* 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
*/
#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
- ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
+ ((-1UL << __VIRTUAL_MASK_SHIFT) >> \
+ KASAN_SHADOW_SCALE_SHIFT))
/*
- * 47 bits for kernel address -> (47 - 3) bits for shadow
- * 56 bits for kernel address -> (56 - 3) bits for shadow
+ * 47 bits for kernel address -> (47 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow
+ * 56 bits for kernel address -> (56 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow
*/
-#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3)))
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + \
+ (1ULL << (__VIRTUAL_MASK_SHIFT - \
+ KASAN_SHADOW_SCALE_SHIFT)))
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 9f2e3102e0bb..367d99cff426 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,8 @@ extern const int kretprobe_blacklist_size;
void arch_remove_kprobe(struct kprobe *p);
asmlinkage void kretprobe_trampoline(void);
+extern void arch_kprobe_override_function(struct pt_regs *regs);
+
/* Architecture specific copy of original instruction*/
struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 516798431328..dd6f57a54a26 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -86,7 +86,7 @@
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
- | X86_CR4_SMAP | X86_CR4_PKE))
+ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -504,6 +504,7 @@ struct kvm_vcpu_arch {
int mp_state;
u64 ia32_misc_enable_msr;
u64 smbase;
+ u64 smi_count;
bool tpr_access_reporting;
u64 ia32_xss;
@@ -760,6 +761,15 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+};
+
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -847,6 +857,8 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+
+ struct kvm_sev_info sev_info;
};
struct kvm_vm_stat {
@@ -883,7 +895,6 @@ struct kvm_vcpu_stat {
u64 request_irq_exits;
u64 irq_exits;
u64 host_state_reload;
- u64 efer_reload;
u64 fpu_reload;
u64 insn_emulation;
u64 insn_emulation_fail;
@@ -965,7 +976,7 @@ struct kvm_x86_ops {
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
- void (*tlb_flush)(struct kvm_vcpu *vcpu);
+ void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
void (*run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu);
@@ -1017,6 +1028,7 @@ struct kvm_x86_ops {
void (*handle_external_intr)(struct kvm_vcpu *vcpu);
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
+ bool (*umip_emulated)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
@@ -1079,6 +1091,10 @@ struct kvm_x86_ops {
int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
int (*enable_smi_window)(struct kvm_vcpu *vcpu);
+
+ int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
+ int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
};
struct kvm_arch_async_pf {
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 8bf450b13d9f..25283f7eb299 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -160,6 +160,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
#define hv_set_synint_state(int_num, val) wrmsrl(int_num, val)
void hyperv_callback_vector(void);
+void hyperv_reenlightenment_vector(void);
#ifdef CONFIG_TRACING
#define trace_hyperv_callback_vector hyperv_callback_vector
#endif
@@ -314,20 +315,29 @@ void hyperv_init(void);
void hyperv_setup_mmu_ops(void);
void hyper_alloc_mmu(void);
void hyperv_report_panic(struct pt_regs *regs, long err);
-bool hv_is_hypercall_page_setup(void);
+bool hv_is_hyperv_initialized(void);
void hyperv_cleanup(void);
+
+void hyperv_reenlightenment_intr(struct pt_regs *regs);
+void set_hv_tscchange_cb(void (*cb)(void));
+void clear_hv_tscchange_cb(void);
+void hyperv_stop_tsc_emulation(void);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
-static inline bool hv_is_hypercall_page_setup(void) { return false; }
+static inline bool hv_is_hyperv_initialized(void) { return false; }
static inline void hyperv_cleanup(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
+static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
+static inline void clear_hv_tscchange_cb(void) {}
+static inline void hyperv_stop_tsc_emulation(void) {};
#endif /* CONFIG_HYPERV */
#ifdef CONFIG_HYPERV_TSCPAGE
struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
-static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+ u64 *cur_tsc)
{
- u64 scale, offset, cur_tsc;
+ u64 scale, offset;
u32 sequence;
/*
@@ -358,7 +368,7 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
scale = READ_ONCE(tsc_pg->tsc_scale);
offset = READ_ONCE(tsc_pg->tsc_offset);
- cur_tsc = rdtsc_ordered();
+ *cur_tsc = rdtsc_ordered();
/*
* Make sure we read sequence after we read all other values
@@ -368,7 +378,14 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
} while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
- return mul_u64_u64_shr(cur_tsc, scale, 64) + offset;
+ return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
+}
+
+static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+{
+ u64 cur_tsc;
+
+ return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
}
#else
@@ -376,5 +393,12 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
return NULL;
}
+
+static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+ u64 *cur_tsc)
+{
+ BUG();
+ return U64_MAX;
+}
#endif
#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e520a1e6fc11..c9084dedfcfa 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -397,6 +397,8 @@
#define MSR_K7_PERFCTR3 0xc0010007
#define MSR_K7_CLK_CTL 0xc001001b
#define MSR_K7_HWCR 0xc0010015
+#define MSR_K7_HWCR_SMMLOCK_BIT 0
+#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 07962f5f6fba..30df295f6d94 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -214,8 +214,7 @@ static __always_inline unsigned long long rdtsc_ordered(void)
* that some other imaginary CPU is updating continuously with a
* time stamp.
*/
- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
- "lfence", X86_FEATURE_LFENCE_RDTSC);
+ barrier_nospec();
return rdtsc();
}
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index d15d471348b8..76b058533e47 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -6,6 +6,7 @@
#include <asm/alternative.h>
#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
#ifdef __ASSEMBLY__
@@ -150,7 +151,7 @@ extern char __indirect_thunk_end[];
* On VMEXIT we must ensure that no RSB predictions learned in the guest
* can be followed in the host, by overwriting the RSB completely. Both
* retpoline and IBRS mitigations for Spectre v2 need this; only on future
- * CPUs with IBRS_ATT *might* it be avoided.
+ * CPUs with IBRS_ALL *might* it be avoided.
*/
static inline void vmexit_fill_RSB(void)
{
@@ -164,10 +165,15 @@ static inline void vmexit_fill_RSB(void)
static inline void indirect_branch_prediction_barrier(void)
{
- alternative_input("",
- "call __ibp_barrier",
- X86_FEATURE_USE_IBPB,
- ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory"));
+ asm volatile(ALTERNATIVE("",
+ "movl %[msr], %%ecx\n\t"
+ "movl %[val], %%eax\n\t"
+ "movl $0, %%edx\n\t"
+ "wrmsr",
+ X86_FEATURE_USE_IBPB)
+ : : [msr] "i" (MSR_IA32_PRED_CMD),
+ [val] "i" (PRED_CMD_IBPB)
+ : "eax", "ecx", "edx", "memory");
}
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4baa6bceb232..d652a3808065 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -52,10 +52,6 @@ static inline void clear_page(void *page)
void copy_page(void *to, void *from);
-#ifdef CONFIG_X86_MCE
-#define arch_unmap_kpfn arch_unmap_kpfn
-#endif
-
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 892df375b615..554841fab717 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -297,9 +297,9 @@ static inline void __flush_tlb_global(void)
{
PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
}
-static inline void __flush_tlb_single(unsigned long addr)
+static inline void __flush_tlb_one_user(unsigned long addr)
{
- PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
+ PVOP_VCALL1(pv_mmu_ops.flush_tlb_one_user, addr);
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 6ec54d01972d..f624f1f10316 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -217,7 +217,7 @@ struct pv_mmu_ops {
/* TLB operations */
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
- void (*flush_tlb_single)(unsigned long addr);
+ void (*flush_tlb_one_user)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 8a3ee355b422..92015c65fa2a 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -22,4 +22,6 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end,
void io_free_memtype(resource_size_t start, resource_size_t end);
+bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
+
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index bc4af5453802..f24df59c40b2 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -158,7 +158,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
-#ifdef CONFIG_SMP
union split_pmd {
struct {
u32 pmd_low;
@@ -166,6 +165,8 @@ union split_pmd {
};
pmd_t pmd;
};
+
+#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
union split_pmd res, *orig = (union split_pmd *)pmdp;
@@ -181,6 +182,40 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
#endif
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+ pmd_t old;
+
+ /*
+ * If pmd has present bit cleared we can get away without expensive
+ * cmpxchg64: we can update pmdp half-by-half without racing with
+ * anybody.
+ */
+ if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+ union split_pmd old, new, *ptr;
+
+ ptr = (union split_pmd *)pmdp;
+
+ new.pmd = pmd;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
+ old.pmd_high = ptr->pmd_high;
+ ptr->pmd_high = new.pmd_high;
+ return old.pmd;
+ }
+
+ do {
+ old = *pmdp;
+ } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
+
+ return old;
+}
+#endif
+
#ifdef CONFIG_SMP
union split_pud {
struct {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e42b8943cb1a..63c2552b6b65 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1109,6 +1109,21 @@ static inline int pud_write(pud_t pud)
return pud_flags(pud) & _PAGE_RW;
}
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+ if (IS_ENABLED(CONFIG_SMP)) {
+ return xchg(pmdp, pmd);
+ } else {
+ pmd_t old = *pmdp;
+ *pmdp = pmd;
+ return old;
+ }
+}
+#endif
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index e67c0620aec2..e55466760ff8 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -61,7 +61,7 @@ void paging_init(void);
#define kpte_clear_flush(ptep, vaddr) \
do { \
pte_clear(&init_mm, (vaddr), (ptep)); \
- __flush_tlb_one((vaddr)); \
+ __flush_tlb_one_kernel((vaddr)); \
} while (0)
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index ce245b0cdfca..0777e18a1d23 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -44,8 +44,9 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
*/
#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
-#define CPU_ENTRY_AREA_BASE \
- ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
+#define CPU_ENTRY_AREA_BASE \
+ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
+ & PMD_MASK)
#define PKMAP_BASE \
((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
diff --git a/arch/x86/include/asm/pmc_core.h b/arch/x86/include/asm/pmc_core.h
deleted file mode 100644
index d4855f11136d..000000000000
--- a/arch/x86/include/asm/pmc_core.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Intel Core SoC Power Management Controller Header File
- *
- * Copyright (c) 2016, Intel Corporation.
- * All Rights Reserved.
- *
- * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
- * Vishwanath Somayaji <vishwanath.somayaji@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- */
-
-#ifndef _ASM_PMC_CORE_H
-#define _ASM_PMC_CORE_H
-
-/* API to read SLP_S0_RESIDENCY counter */
-int intel_pmc_slp_s0_counter_read(u32 *data);
-
-#endif /* _ASM_PMC_CORE_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index efbde088a718..1bd9ed87606f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -91,7 +91,7 @@ struct cpuinfo_x86 {
__u8 x86; /* CPU family */
__u8 x86_vendor; /* CPU vendor */
__u8 x86_model;
- __u8 x86_mask;
+ __u8 x86_stepping;
#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
@@ -109,7 +109,7 @@ struct cpuinfo_x86 {
char x86_vendor_id[16];
char x86_model_id[64];
/* in KB - valid for CPUS which support this call: */
- int x86_cache_size;
+ unsigned int x86_cache_size;
int x86_cache_alignment; /* In bytes */
/* Cache QoS architectural values: */
int x86_cache_max_rmid; /* max index */
@@ -460,8 +460,6 @@ struct thread_struct {
unsigned short gsindex;
#endif
- u32 status; /* thread synchronous flags */
-
#ifdef CONFIG_X86_64
unsigned long fsbase;
unsigned long gsbase;
@@ -507,6 +505,14 @@ struct thread_struct {
*/
};
+/* Whitelist the FPU state from the task_struct for hardened usercopy. */
+static inline void arch_thread_struct_whitelist(unsigned long *offset,
+ unsigned long *size)
+{
+ *offset = offsetof(struct thread_struct, fpu.state);
+ *size = fpu_kernel_xstate_size;
+}
+
/*
* Thread-synchronous status.
*
@@ -971,7 +977,4 @@ bool xen_set_default_idle(void);
void stop_this_cpu(void *dummy);
void df_debug(struct pt_regs *regs, long error_code);
-
-void __ibp_barrier(void);
-
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14131dd06b29..6de1fd3d0097 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
return regs->ax;
}
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+ regs->ax = rc;
+}
+
/*
* user_mode(regs) determines whether a register set came from user
* mode. On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 78dd9df88157..0487ac054870 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -146,6 +146,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
+#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+
struct __attribute__ ((__packed__)) vmcb_seg {
u16 selector;
u16 attrib;
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index bdf9aed40403..1c6a6cb230ff 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -28,8 +28,6 @@ static inline void pci_swiotlb_late_init(void)
}
#endif
-static inline void dma_mark_clean(void *addr, size_t size) {}
-
extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
unsigned long attrs);
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
new file mode 100644
index 000000000000..c67caafd3381
--- /dev/null
+++ b/arch/x86/include/asm/sync_core.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SYNC_CORE_H
+#define _ASM_X86_SYNC_CORE_H
+
+#include <linux/preempt.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode. x86 implements return to user-space through sysexit,
+ * sysrel, and sysretq, which are not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+ /* With PTI, we unconditionally serialize before running user code. */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ return;
+ /*
+ * Return from interrupt and NMI is done through iret, which is core
+ * serializing.
+ */
+ if (in_irq() || in_nmi())
+ return;
+ sync_core();
+}
+
+#endif /* _ASM_X86_SYNC_CORE_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index e3c95e8e61c5..03eedc21246d 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
* TS_COMPAT is set for 32-bit syscall entries and then
* remains set until we return to user mode.
*/
- if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
/*
* Sign-extend the value so (int)-EFOO becomes (long)-EFOO
* and will match correctly in comparisons.
@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task->thread.status & TS_COMPAT)
+ if (task->thread_info.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
const unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task->thread.status & TS_COMPAT)
+ if (task->thread_info.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d25a638a2720..a5d9521bb2cb 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -55,6 +55,7 @@ struct task_struct;
struct thread_info {
unsigned long flags; /* low level flags */
+ u32 status; /* thread synchronous flags */
};
#define INIT_THREAD_INFO(tsk) \
@@ -219,7 +220,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#define in_ia32_syscall() true
#else
#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
- current->thread.status & TS_COMPAT)
+ current_thread_info()->status & TS_COMPAT)
#endif
/*
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d33e4a26dc7e..84137c22fdfa 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -140,7 +140,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#else
#define __flush_tlb() __native_flush_tlb()
#define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
static inline bool tlb_defer_switch_to_init_mm(void)
@@ -174,6 +174,8 @@ struct tlb_state {
struct mm_struct *loaded_mm;
u16 loaded_mm_asid;
u16 next_asid;
+ /* last user mm's ctx id */
+ u64 last_ctx_id;
/*
* We can be in one of several states:
@@ -398,7 +400,7 @@ static inline void __native_flush_tlb_global(void)
/*
* flush one page in the user mapping
*/
-static inline void __native_flush_tlb_single(unsigned long addr)
+static inline void __native_flush_tlb_one_user(unsigned long addr)
{
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
@@ -435,18 +437,31 @@ static inline void __flush_tlb_all(void)
/*
* flush one page in the kernel mapping
*/
-static inline void __flush_tlb_one(unsigned long addr)
+static inline void __flush_tlb_one_kernel(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
- __flush_tlb_single(addr);
+
+ /*
+ * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+ * paravirt equivalent. Even with PCID, this is sufficient: we only
+ * use PCID if we also use global PTEs for the kernel mapping, and
+ * INVLPG flushes global translations across all address spaces.
+ *
+ * If PTI is on, then the kernel is mapped with non-global PTEs, and
+ * __flush_tlb_one_user() will flush the given address for the current
+ * kernel address space and for its usermode counterpart, but it does
+ * not flush it for other address spaces.
+ */
+ __flush_tlb_one_user(addr);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
- * __flush_tlb_single() will have cleared the TLB entry for this ASID,
- * but since kernel space is replicated across all, we must also
- * invalidate all others.
+ * See above. We need to propagate the flush to all other address
+ * spaces. In principle, we only need to propagate it to kernelmode
+ * address spaces, but the extra bookkeeping we would need is not
+ * worth it.
*/
invalidate_other_asid();
}
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 574dff4d2913..aae77eb8491c 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -124,6 +124,11 @@ extern int __get_user_bad(void);
#define __uaccess_begin() stac()
#define __uaccess_end() clac()
+#define __uaccess_begin_nospec() \
+({ \
+ stac(); \
+ barrier_nospec(); \
+})
/*
* This is a type: either unsigned long, if the argument fits into
@@ -445,7 +450,7 @@ do { \
({ \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
- __uaccess_begin(); \
+ __uaccess_begin_nospec(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
__uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
@@ -487,6 +492,10 @@ struct __large_struct { unsigned long buf[100]; };
__uaccess_begin(); \
barrier();
+#define uaccess_try_nospec do { \
+ current->thread.uaccess_err = 0; \
+ __uaccess_begin_nospec(); \
+
#define uaccess_catch(err) \
__uaccess_end(); \
(err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
@@ -548,7 +557,7 @@ struct __large_struct { unsigned long buf[100]; };
* get_user_ex(...);
* } get_user_catch(err)
*/
-#define get_user_try uaccess_try
+#define get_user_try uaccess_try_nospec
#define get_user_catch(err) uaccess_catch(err)
#define get_user_ex(x, ptr) do { \
@@ -582,7 +591,7 @@ extern void __cmpxchg_wrong_size(void)
__typeof__(ptr) __uval = (uval); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
- __uaccess_begin(); \
+ __uaccess_begin_nospec(); \
switch (size) { \
case 1: \
{ \
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 72950401b223..ba2dc1930630 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -29,21 +29,21 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
switch (n) {
case 1:
ret = 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u8 *)to, from, ret,
"b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:
ret = 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u16 *)to, from, ret,
"w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:
ret = 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u32 *)to, from, ret,
"l", "k", "=r", 4);
__uaccess_end();
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f07ef3c575db..62546b3a398e 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -55,31 +55,31 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
case 1:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
__uaccess_end();
return ret;
case 8:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
__uaccess_end();
return ret;
case 10:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
if (likely(!ret))
@@ -89,7 +89,7 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
__uaccess_end();
return ret;
case 16:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
if (likely(!ret))
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 1e901e421f2d..322681622d1e 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -5,3 +5,4 @@ generic-y += bpf_perf_event.h
generated-y += unistd_32.h
generated-y += unistd_64.h
generated-y += unistd_x32.h
+generic-y += poll.h
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 1a5bfead93b4..197c2e6c7376 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -40,6 +40,9 @@
*/
#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
+/* AccessReenlightenmentControls privilege */
+#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
+
/*
* Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
* and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
@@ -234,6 +237,30 @@
#define HV_X64_MSR_CRASH_PARAMS \
(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+/* TSC emulation after migration */
+#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
+
+struct hv_reenlightenment_control {
+ u64 vector:8;
+ u64 reserved1:8;
+ u64 enabled:1;
+ u64 reserved2:15;
+ u64 target_vp:32;
+};
+
+#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
+#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
+
+struct hv_tsc_emulation_control {
+ u64 enabled:1;
+ u64 reserved:63;
+};
+
+struct hv_tsc_emulation_status {
+ u64 inprogress:1;
+ u64 reserved:63;
+};
+
#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 09cc06483bed..7a2ade4aa235 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -25,6 +25,7 @@
#define KVM_FEATURE_STEAL_TIME 5
#define KVM_FEATURE_PV_EOI 6
#define KVM_FEATURE_PV_UNHALT 7
+#define KVM_FEATURE_PV_TLB_FLUSH 9
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -51,6 +52,9 @@ struct kvm_steal_time {
__u32 pad[11];
};
+#define KVM_VCPU_PREEMPTED (1 << 0)
+#define KVM_VCPU_FLUSH_TLB (1 << 1)
+
#define KVM_CLOCK_PAIRING_WALLCLOCK 0
struct kvm_clock_pairing {
__s64 sec;
diff --git a/arch/x86/include/uapi/asm/poll.h b/arch/x86/include/uapi/asm/poll.h
deleted file mode 100644
index c98509d3149e..000000000000
--- a/arch/x86/include/uapi/asm/poll.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ec3a286163c3..2aa92094b59d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -36,6 +36,7 @@
#include <linux/ioport.h>
#include <linux/pci.h>
#include <linux/efi-bgrt.h>
+#include <linux/serial_core.h>
#include <asm/e820/api.h>
#include <asm/irqdomain.h>
@@ -1625,6 +1626,8 @@ int __init acpi_boot_init(void)
if (!acpi_noirq)
x86_init.pci.init = pci_acpi_init;
+ /* Do not enable ACPI SPCR console by default */
+ acpi_parse_spcr(earlycon_acpi_spcr_enable, false);
return 0;
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 30571fdaaf6f..a481763a3776 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str)
}
__setup("noreplace-smp", setup_noreplace_smp);
-#ifdef CONFIG_PARAVIRT
-static int __initdata_or_module noreplace_paravirt = 0;
-
-static int __init setup_noreplace_paravirt(char *str)
-{
- noreplace_paravirt = 1;
- return 1;
-}
-__setup("noreplace-paravirt", setup_noreplace_paravirt);
-#endif
-
#define DPRINTK(fmt, args...) \
do { \
if (debug_alternative) \
@@ -599,9 +588,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
struct paravirt_patch_site *p;
char insnbuf[MAX_PATCH_LEN];
- if (noreplace_paravirt)
- return;
-
for (p = start; p < end; p++) {
unsigned int used;
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index cc0e8bc0ea3f..ecd486cb06ab 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -31,6 +31,7 @@
#include <linux/io.h>
#include <linux/gfp.h>
#include <linux/atomic.h>
+#include <linux/dma-direct.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 6db28f17ff28..c88e0b127810 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -235,7 +235,7 @@ int amd_cache_northbridges(void)
if (boot_cpu_data.x86 == 0x10 &&
boot_cpu_data.x86_model >= 0x8 &&
(boot_cpu_data.x86_model > 0x9 ||
- boot_cpu_data.x86_mask >= 0x1))
+ boot_cpu_data.x86_stepping >= 0x1))
amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
if (boot_cpu_data.x86 == 0x15)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 25ddf02598d2..b203af0855b5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -546,7 +546,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static u32 hsx_deadline_rev(void)
{
- switch (boot_cpu_data.x86_mask) {
+ switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x3a; /* EP */
case 0x04: return 0x0f; /* EX */
}
@@ -556,7 +556,7 @@ static u32 hsx_deadline_rev(void)
static u32 bdx_deadline_rev(void)
{
- switch (boot_cpu_data.x86_mask) {
+ switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x00000011;
case 0x03: return 0x0700000e;
case 0x04: return 0x0f00000c;
@@ -568,7 +568,7 @@ static u32 bdx_deadline_rev(void)
static u32 skx_deadline_rev(void)
{
- switch (boot_cpu_data.x86_mask) {
+ switch (boot_cpu_data.x86_stepping) {
case 0x03: return 0x01000136;
case 0x04: return 0x02000014;
}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 46b675aaf20b..f11910b44638 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -1176,16 +1176,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
uv_gre_table = gre;
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ unsigned long size = ((unsigned long)(gre->limit - lgre)
+ << UV_GAM_RANGE_SHFT);
+ int order = 0;
+ char suffix[] = " KMGTPE";
+
+ while (size > 9999 && order < sizeof(suffix)) {
+ size /= 1024;
+ order++;
+ }
+
if (!index) {
pr_info("UV: GAM Range Table...\n");
pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
}
- pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n",
+ pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d %04x %02x %02x\n",
index++,
(unsigned long)lgre << UV_GAM_RANGE_SHFT,
(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
- ((unsigned long)(gre->limit - lgre)) >>
- (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */
+ size, suffix[order],
gre->type, gre->nasid, gre->sockid, gre->pnode);
lgre = gre->limit;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index e4b0d92b3ae0..dfcbe6924eaf 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1506,7 +1506,7 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *
return 0;
}
-static unsigned int do_poll(struct file *fp, poll_table *wait)
+static __poll_t do_poll(struct file *fp, poll_table *wait)
{
struct apm_user *as;
@@ -1515,7 +1515,7 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
return 0;
poll_wait(fp, &apm_waitqueue, wait);
if (!queue_empty(as))
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
return 0;
}
@@ -2389,6 +2389,7 @@ static int __init apm_init(void)
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
if (idle_threshold < 100) {
+ cpuidle_poll_state_init(&apm_idle_driver);
if (!cpuidle_register_driver(&apm_idle_driver))
if (cpuidle_register_device(&apm_cpuidle_device))
cpuidle_unregister_driver(&apm_idle_driver);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fa1261eefa16..f91ba53e06c8 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -18,7 +18,7 @@ void foo(void)
OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
- OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
+ OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ea831c858195..f0e6456ca7d3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -119,7 +119,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
return;
}
- if (c->x86_model == 6 && c->x86_mask == 1) {
+ if (c->x86_model == 6 && c->x86_stepping == 1) {
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
@@ -149,7 +149,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
/* K6 with old style WHCR */
if (c->x86_model < 8 ||
- (c->x86_model == 8 && c->x86_mask < 8)) {
+ (c->x86_model == 8 && c->x86_stepping < 8)) {
/* We can only write allocate on the low 508Mb */
if (mbytes > 508)
mbytes = 508;
@@ -168,7 +168,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
return;
}
- if ((c->x86_model == 8 && c->x86_mask > 7) ||
+ if ((c->x86_model == 8 && c->x86_stepping > 7) ||
c->x86_model == 9 || c->x86_model == 13) {
/* The more serious chips .. */
@@ -221,7 +221,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
* are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
* As per AMD technical note 27212 0.2
*/
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+ if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) {
rdmsr(MSR_K7_CLK_CTL, l, h);
if ((l & 0xfff00000) != 0x20000000) {
pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
@@ -241,12 +241,12 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
* but they are not certified as MP capable.
*/
/* Athlon 660/661 is valid. */
- if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
- (c->x86_mask == 1)))
+ if ((c->x86_model == 6) && ((c->x86_stepping == 0) ||
+ (c->x86_stepping == 1)))
return;
/* Duron 670 is valid */
- if ((c->x86_model == 7) && (c->x86_mask == 0))
+ if ((c->x86_model == 7) && (c->x86_stepping == 0))
return;
/*
@@ -256,8 +256,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
* See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
* more.
*/
- if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
- ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ if (((c->x86_model == 6) && (c->x86_stepping >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_stepping >= 1)) ||
(c->x86_model > 7))
if (cpu_has(c, X86_FEATURE_MP))
return;
@@ -556,6 +556,51 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
}
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ /*
+ * BIOS support is required for SME and SEV.
+ * For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+ * the SME physical address space reduction value.
+ * If BIOS has not enabled SME then don't advertise the
+ * SME feature (set in scattered.c).
+ * For SEV: If BIOS has not enabled SEV then don't advertise the
+ * SEV feature (set in scattered.c).
+ *
+ * In all cases, since support for SME and SEV requires long mode,
+ * don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+ /* Check if memory encryption is enabled */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ goto clear_all;
+
+ /*
+ * Always adjust physical address bits. Even though this
+ * will be a value above 32-bits this is still done for
+ * CONFIG_X86_32 so that accurate values are reported.
+ */
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto clear_all;
+
+ rdmsrl(MSR_K7_HWCR, msr);
+ if (!(msr & MSR_K7_HWCR_SMMLOCK))
+ goto clear_sev;
+
+ return;
+
+clear_all:
+ clear_cpu_cap(c, X86_FEATURE_SME);
+clear_sev:
+ clear_cpu_cap(c, X86_FEATURE_SEV);
+ }
+}
+
static void early_init_amd(struct cpuinfo_x86 *c)
{
u32 dummy;
@@ -583,7 +628,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
/* Set MTRR capability flag if appropriate */
if (c->x86 == 5)
if (c->x86_model == 13 || c->x86_model == 9 ||
- (c->x86_model == 8 && c->x86_mask >= 8))
+ (c->x86_model == 8 && c->x86_stepping >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
@@ -627,26 +672,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_E400);
- /*
- * BIOS support is required for SME. If BIOS has enabled SME then
- * adjust x86_phys_bits by the SME physical address space reduction
- * value. If BIOS has not enabled SME then don't advertise the
- * feature (set in scattered.c). Also, since the SME support requires
- * long mode, don't advertise the feature under CONFIG_X86_32.
- */
- if (cpu_has(c, X86_FEATURE_SME)) {
- u64 msr;
-
- /* Check if SME is enabled */
- rdmsrl(MSR_K8_SYSCFG, msr);
- if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
- c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
- if (IS_ENABLED(CONFIG_X86_32))
- clear_cpu_cap(c, X86_FEATURE_SME);
- } else {
- clear_cpu_cap(c, X86_FEATURE_SME);
- }
- }
+ early_detect_mem_encrypt(c);
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -769,7 +795,7 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
* Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
* all up to and including B1.
*/
- if (c->x86_model <= 1 && c->x86_mask <= 1)
+ if (c->x86_model <= 1 && c->x86_stepping <= 1)
set_cpu_cap(c, X86_FEATURE_CPB);
}
@@ -880,11 +906,11 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
/* AMD errata T13 (order #21922) */
if ((c->x86 == 6)) {
/* Duron Rev A0 */
- if (c->x86_model == 3 && c->x86_mask == 0)
+ if (c->x86_model == 3 && c->x86_stepping == 0)
size = 64;
/* Tbird rev A1/A2 */
if (c->x86_model == 4 &&
- (c->x86_mask == 0 || c->x86_mask == 1))
+ (c->x86_stepping == 0 || c->x86_stepping == 1))
size = 256;
}
return size;
@@ -1021,7 +1047,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
}
/* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_mask;
+ ms = (cpu->x86_model << 4) | cpu->x86_stepping;
while ((range = *erratum++))
if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
(ms >= AMD_MODEL_RANGE_START(range)) &&
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 3bfb2b23d79c..d71c8b54b696 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -103,7 +103,7 @@ bool retpoline_module_ok(bool has_retpoline)
if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
return true;
- pr_err("System may be vunerable to spectre v2\n");
+ pr_err("System may be vulnerable to spectre v2\n");
spectre_v2_bad_module = true;
return false;
}
@@ -119,13 +119,13 @@ static inline const char *spectre_v2_module_string(void) { return ""; }
static void __init spec2_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
- pr_info("%s\n", reason);
+ pr_info("%s selected on command line.\n", reason);
}
static void __init spec2_print_if_secure(const char *reason)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
- pr_info("%s\n", reason);
+ pr_info("%s selected on command line.\n", reason);
}
static inline bool retp_compiler(void)
@@ -140,42 +140,65 @@ static inline bool match_option(const char *arg, int arglen, const char *opt)
return len == arglen && !strncmp(arg, opt, len);
}
+static const struct {
+ const char *option;
+ enum spectre_v2_mitigation_cmd cmd;
+ bool secure;
+} mitigation_options[] = {
+ { "off", SPECTRE_V2_CMD_NONE, false },
+ { "on", SPECTRE_V2_CMD_FORCE, true },
+ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "auto", SPECTRE_V2_CMD_AUTO, false },
+};
+
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
char arg[20];
- int ret;
-
- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
- sizeof(arg));
- if (ret > 0) {
- if (match_option(arg, ret, "off")) {
- goto disable;
- } else if (match_option(arg, ret, "on")) {
- spec2_print_if_secure("force enabled on command line.");
- return SPECTRE_V2_CMD_FORCE;
- } else if (match_option(arg, ret, "retpoline")) {
- spec2_print_if_insecure("retpoline selected on command line.");
- return SPECTRE_V2_CMD_RETPOLINE;
- } else if (match_option(arg, ret, "retpoline,amd")) {
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
- return SPECTRE_V2_CMD_AUTO;
- }
- spec2_print_if_insecure("AMD retpoline selected on command line.");
- return SPECTRE_V2_CMD_RETPOLINE_AMD;
- } else if (match_option(arg, ret, "retpoline,generic")) {
- spec2_print_if_insecure("generic retpoline selected on command line.");
- return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
- } else if (match_option(arg, ret, "auto")) {
+ int ret, i;
+ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+ return SPECTRE_V2_CMD_NONE;
+ else {
+ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
+ if (ret < 0)
+ return SPECTRE_V2_CMD_AUTO;
+
+ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
+ if (!match_option(arg, ret, mitigation_options[i].option))
+ continue;
+ cmd = mitigation_options[i].cmd;
+ break;
+ }
+
+ if (i >= ARRAY_SIZE(mitigation_options)) {
+ pr_err("unknown option (%s). Switching to AUTO select\n", arg);
return SPECTRE_V2_CMD_AUTO;
}
}
- if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+ if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
+ !IS_ENABLED(CONFIG_RETPOLINE)) {
+ pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
-disable:
- spec2_print_if_insecure("disabled on command line.");
- return SPECTRE_V2_CMD_NONE;
+ }
+
+ if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+ pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (mitigation_options[i].secure)
+ spec2_print_if_secure(mitigation_options[i].option);
+ else
+ spec2_print_if_insecure(mitigation_options[i].option);
+
+ return cmd;
}
/* Check for Skylake-like CPUs (for RSB handling) */
@@ -213,10 +236,10 @@ static void __init spectre_v2_select_mitigation(void)
return;
case SPECTRE_V2_CMD_FORCE:
- /* FALLTRHU */
case SPECTRE_V2_CMD_AUTO:
- goto retpoline_auto;
-
+ if (IS_ENABLED(CONFIG_RETPOLINE))
+ goto retpoline_auto;
+ break;
case SPECTRE_V2_CMD_RETPOLINE_AMD:
if (IS_ENABLED(CONFIG_RETPOLINE))
goto retpoline_amd;
@@ -230,14 +253,14 @@ static void __init spectre_v2_select_mitigation(void)
goto retpoline_auto;
break;
}
- pr_err("kernel not compiled with retpoline; no mitigation available!");
+ pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!");
return;
retpoline_auto:
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
retpoline_amd:
if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
- pr_err("LFENCE not serializing. Switching to generic retpoline\n");
+ pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
goto retpoline_generic;
}
mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
@@ -255,7 +278,7 @@ retpoline_auto:
pr_info("%s\n", spectre_v2_strings[mode]);
/*
- * If neither SMEP or KPTI are available, there is a risk of
+ * If neither SMEP nor PTI are available, there is a risk of
* hitting userspace addresses in the RSB after a context switch
* from a shallow call stack to a deeper one. To prevent this fill
* the entire RSB, even when using IBRS.
@@ -269,21 +292,20 @@ retpoline_auto:
if ((!boot_cpu_has(X86_FEATURE_PTI) &&
!boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Filling RSB on context switch\n");
+ pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
}
/* Initialize Indirect Branch Prediction Barrier if supported */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
- pr_info("Enabling Indirect Branch Prediction Barrier\n");
+ pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
}
}
#undef pr_fmt
#ifdef CONFIG_SYSFS
-ssize_t cpu_show_meltdown(struct device *dev,
- struct device_attribute *attr, char *buf)
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
{
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
return sprintf(buf, "Not affected\n");
@@ -292,16 +314,14 @@ ssize_t cpu_show_meltdown(struct device *dev,
return sprintf(buf, "Vulnerable\n");
}
-ssize_t cpu_show_spectre_v1(struct device *dev,
- struct device_attribute *attr, char *buf)
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
return sprintf(buf, "Not affected\n");
- return sprintf(buf, "Vulnerable\n");
+ return sprintf(buf, "Mitigation: __user pointer sanitization\n");
}
-ssize_t cpu_show_spectre_v2(struct device *dev,
- struct device_attribute *attr, char *buf)
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
return sprintf(buf, "Not affected\n");
@@ -311,9 +331,3 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
spectre_v2_module_string());
}
#endif
-
-void __ibp_barrier(void)
-{
- __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
-}
-EXPORT_SYMBOL_GPL(__ibp_barrier);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c578cd29c2d2..e5ec0f11c0de 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -140,7 +140,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_TSC);
break;
case 8:
- switch (c->x86_mask) {
+ switch (c->x86_stepping) {
default:
name = "2";
break;
@@ -215,7 +215,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* - Note, it seems this may only be in engineering samples.
*/
if ((c->x86 == 6) && (c->x86_model == 9) &&
- (c->x86_mask == 1) && (size == 65))
+ (c->x86_stepping == 1) && (size == 65))
size -= 1;
return size;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c7c996a692fd..824aee0117bb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -731,7 +731,7 @@ void cpu_detect(struct cpuinfo_x86 *c)
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
c->x86 = x86_family(tfms);
c->x86_model = x86_model(tfms);
- c->x86_mask = x86_stepping(tfms);
+ c->x86_stepping = x86_stepping(tfms);
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
@@ -750,6 +750,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
}
}
+static void init_speculation_control(struct cpuinfo_x86 *c)
+{
+ /*
+ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+ * and they also have a different bit for STIBP support. Also,
+ * a hypervisor might have set the individual AMD bits even on
+ * Intel CPUs, for finer-grained selection of what's available.
+ *
+ * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
+ * features, which are visible in /proc/cpuinfo and used by the
+ * kernel. So set those accordingly from the Intel bits.
+ */
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+ }
+ if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+}
+
void get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
@@ -844,6 +864,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
init_scattered_cpuid_features(c);
+ init_speculation_control(c);
/*
* Clear/Set all flags overridden by options, after probe.
@@ -879,7 +900,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#endif
}
-static const __initdata struct x86_cpu_id cpu_no_speculation[] = {
+static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
@@ -892,7 +913,7 @@ static const __initdata struct x86_cpu_id cpu_no_speculation[] = {
{}
};
-static const __initdata struct x86_cpu_id cpu_no_meltdown[] = {
+static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
{ X86_VENDOR_AMD },
{}
};
@@ -1163,9 +1184,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
int i;
c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
+ c->x86_cache_size = 0;
c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_model = c->x86_stepping = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_max_cores = 1;
@@ -1357,8 +1378,8 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
- if (c->x86_mask || c->cpuid_level >= 0)
- pr_cont(", stepping: 0x%x)\n", c->x86_mask);
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ pr_cont(", stepping: 0x%x)\n", c->x86_stepping);
else
pr_cont(")\n");
}
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 6b4bb335641f..8949b7ae6d92 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -215,7 +215,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
/* common case step number/rev -- exceptions handled below */
c->x86_model = (dir1 >> 4) + 1;
- c->x86_mask = dir1 & 0xf;
+ c->x86_stepping = dir1 & 0xf;
/* Now cook; the original recipe is by Channing Corn, from Cyrix.
* We do the same thing for each generation: we work out
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 6936d14d4c77..d19e903214b4 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -116,14 +116,13 @@ struct sku_microcode {
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
- { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
- { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
- { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 },
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 },
+ { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 },
{ INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
{ INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
- { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
{ INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
{ INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
{ INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
@@ -136,8 +135,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = {
{ INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
{ INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
{ INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
- /* Updated in the 20180108 release; blacklist until we know otherwise */
- { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
/* Observed in the wild */
{ INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
{ INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
@@ -149,7 +146,7 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
if (c->x86_model == spectre_bad_microcodes[i].model &&
- c->x86_mask == spectre_bad_microcodes[i].stepping)
+ c->x86_stepping == spectre_bad_microcodes[i].stepping)
return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
return false;
@@ -175,28 +172,17 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
- /*
- * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
- * and they also have a different bit for STIBP support. Also,
- * a hypervisor might have set the individual AMD bits even on
- * Intel CPUs, for finer-grained selection of what's available.
- */
- if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
- set_cpu_cap(c, X86_FEATURE_IBRS);
- set_cpu_cap(c, X86_FEATURE_IBPB);
- }
- if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
- set_cpu_cap(c, X86_FEATURE_STIBP);
-
/* Now if any of them are set, check the blacklist and clear the lot */
- if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
+ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
+ cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
+ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
- clear_cpu_cap(c, X86_FEATURE_IBRS);
- clear_cpu_cap(c, X86_FEATURE_IBPB);
- clear_cpu_cap(c, X86_FEATURE_STIBP);
- clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL);
- clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_IBRS);
+ setup_clear_cpu_cap(X86_FEATURE_IBPB);
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
}
/*
@@ -207,7 +193,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
- if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
+ if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
c->microcode < 0x20e) {
pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
@@ -223,7 +209,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
/* CPUID workaround for 0F33/0F34 CPU */
if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+ && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -321,7 +307,7 @@ int ppro_with_ram_bug(void)
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask < 8) {
+ boot_cpu_data.x86_stepping < 8) {
pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
}
@@ -338,7 +324,7 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
* Mask B, Pentium, but not Pentium MMX
*/
if (c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
c->x86_model <= 3) {
/*
* Remember we have B step Pentia with bugs
@@ -381,7 +367,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+ if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
@@ -399,7 +385,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
+ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
@@ -414,7 +400,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* Specification Update").
*/
if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
- (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+ (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
set_cpu_bug(c, X86_BUG_11AP);
@@ -661,7 +647,7 @@ static void init_intel(struct cpuinfo_x86 *c)
case 6:
if (l2 == 128)
p = "Celeron (Mendocino)";
- else if (c->x86_mask == 0 || c->x86_mask == 5)
+ else if (c->x86_stepping == 0 || c->x86_stepping == 5)
p = "Celeron-A";
break;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 410629f10ad3..589b948e6e01 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -819,7 +819,7 @@ static __init void rdt_quirks(void)
cache_alloc_hsw_probe();
break;
case INTEL_FAM6_SKYLAKE_X:
- if (boot_cpu_data.x86_mask <= 4)
+ if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
index 7f85b76f43bc..97685a0c3175 100644
--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -243,13 +243,13 @@ out:
return err ? err : buf - ubuf;
}
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_chrdev_wait, wait);
if (READ_ONCE(mcelog.next))
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index aa0d5df9dc60..e956eb267061 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -115,4 +115,19 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
extern struct mca_config mca_cfg;
+#ifndef CONFIG_X86_64
+/*
+ * On 32-bit systems it would be difficult to safely unmap a poison page
+ * from the kernel 1:1 map because there are no non-canonical addresses that
+ * we can use to refer to the address without risking a speculative access.
+ * However, this isn't much of an issue because:
+ * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which
+ * are only mapped into the kernel as needed
+ * 2) Few people would run a 32-bit kernel on a machine that supports
+ * recoverable errors because they have too much memory to boot 32-bit.
+ */
+static inline void mce_unmap_kpfn(unsigned long pfn) {}
+#define mce_unmap_kpfn mce_unmap_kpfn
+#endif
+
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index c3655e0fc156..8ff94d1e2dce 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -14,7 +14,6 @@
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
-#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
@@ -106,6 +105,10 @@ static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+#ifndef mce_unmap_kpfn
+static void mce_unmap_kpfn(unsigned long pfn);
+#endif
+
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -235,7 +238,7 @@ static void __print_mce(struct mce *m)
m->cs, m->ip);
if (m->cs == __KERNEL_CS)
- print_symbol("{%s}", m->ip);
+ pr_cont("{%pS}", (void *)(unsigned long)m->ip);
pr_cont("\n");
}
@@ -591,7 +594,8 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
- memory_failure(pfn, MCE_VECTOR, 0);
+ if (!memory_failure(pfn, 0))
+ mce_unmap_kpfn(pfn);
}
return NOTIFY_OK;
@@ -1055,15 +1059,16 @@ static int do_memory_failure(struct mce *m)
pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
if (!(m->mcgstatus & MCG_STATUS_RIPV))
flags |= MF_MUST_KILL;
- ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+ ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
if (ret)
pr_err("Memory error not recovered");
+ else
+ mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
return ret;
}
-#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
-
-void arch_unmap_kpfn(unsigned long pfn)
+#ifndef mce_unmap_kpfn
+static void mce_unmap_kpfn(unsigned long pfn)
{
unsigned long decoy_addr;
@@ -1074,7 +1079,7 @@ void arch_unmap_kpfn(unsigned long pfn)
* We would like to just call:
* set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
* but doing that would radically increase the odds of a
- * speculative access to the posion page because we'd have
+ * speculative access to the poison page because we'd have
* the virtual address of the kernel 1:1 mapping sitting
* around in registers.
* Instead we get tricky. We create a non-canonical address
@@ -1099,7 +1104,6 @@ void arch_unmap_kpfn(unsigned long pfn)
if (set_memory_np(decoy_addr, 1))
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
-
}
#endif
@@ -1334,7 +1338,7 @@ out_ist:
EXPORT_SYMBOL_GPL(do_machine_check);
#ifndef CONFIG_MEMORY_FAILURE
-int memory_failure(unsigned long pfn, int vector, int flags)
+int memory_failure(unsigned long pfn, int flags)
{
/* mce_severity() should not hand us an ACTION_REQUIRED error */
BUG_ON(flags & MF_ACTION_REQUIRED);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index e4fc595cd6ea..319dd65f98a2 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -560,7 +560,7 @@ static ssize_t pf_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR_WO(reload);
static DEVICE_ATTR(version, 0400, version_show, NULL);
static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index f7c55b0e753a..a15db2b4e0d6 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -921,7 +921,7 @@ static bool is_blacklisted(unsigned int cpu)
*/
if (c->x86 == 6 &&
c->x86_model == INTEL_FAM6_BROADWELL_X &&
- c->x86_mask == 0x01 &&
+ c->x86_stepping == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
@@ -944,7 +944,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
return UCODE_NFOUND;
sprintf(name, "intel-ucode/%02x-%02x-%02x",
- c->x86, c->x86_model, c->x86_mask);
+ c->x86, c->x86_model, c->x86_stepping);
if (request_firmware_direct(&firmware, name, device)) {
pr_debug("data file %s load failed\n", name);
@@ -982,7 +982,7 @@ static struct microcode_ops microcode_intel_ops = {
static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
{
- u64 llc_size = c->x86_cache_size * 1024;
+ u64 llc_size = c->x86_cache_size * 1024ULL;
do_div(llc_size, c->x86_max_cores);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 85eb5fc180c8..9340f41ce8d3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void)
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+
+ /* Setup the IDT for reenlightenment notifications */
+ if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+ alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
+ hyperv_reenlightenment_vector);
+
#endif
}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fdc55215d44d..e12ee86906c6 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -859,7 +859,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
*/
if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask <= 7) {
+ boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 40d5a8a75212..7468de429087 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -711,8 +711,8 @@ void __init mtrr_bp_init(void)
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 0xF &&
boot_cpu_data.x86_model == 0x3 &&
- (boot_cpu_data.x86_mask == 0x3 ||
- boot_cpu_data.x86_mask == 0x4))
+ (boot_cpu_data.x86_stepping == 0x3 ||
+ boot_cpu_data.x86_stepping == 0x4))
phys_addr = 36;
size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e7ecedafa1c8..2c8522a39ed5 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -72,8 +72,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
c->x86_model,
c->x86_model_id[0] ? c->x86_model_id : "unknown");
- if (c->x86_mask || c->cpuid_level >= 0)
- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ seq_printf(m, "stepping\t: %d\n", c->x86_stepping);
else
seq_puts(m, "stepping\t: unknown\n");
if (c->microcode)
@@ -91,8 +91,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
}
/* Cache size */
- if (c->x86_cache_size >= 0)
- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+ if (c->x86_cache_size)
+ seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size);
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4075d2be5357..772c219b6889 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -30,6 +30,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
+ { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 76e07698e6d1..25de5f6ca997 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -2,7 +2,6 @@
/*
* Architecture specific OF callbacks.
*/
-#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/interrupt.h>
@@ -39,11 +38,6 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
BUG();
}
-void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
-{
- return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
-}
-
void __init add_dtb(u64 data)
{
initial_dtb = data + offsetof(struct setup_data, data);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index afbecff161d1..a2d8a3908670 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,7 +109,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
struct stack_info stack_info = {0};
unsigned long visit_mask = 0;
int graph_idx = 0;
- bool partial;
+ bool partial = false;
printk("%sCall Trace:\n", log_lvl);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 1e82f787c160..bae0d32e327b 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -243,7 +243,7 @@ static void __init intel_remapping_check(int num, int slot, int func)
#define KB(x) ((x) * 1024UL)
#define MB(x) (KB (KB (x)))
-static size_t __init i830_tseg_size(void)
+static resource_size_t __init i830_tseg_size(void)
{
u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
@@ -256,7 +256,7 @@ static size_t __init i830_tseg_size(void)
return KB(512);
}
-static size_t __init i845_tseg_size(void)
+static resource_size_t __init i845_tseg_size(void)
{
u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK;
@@ -273,7 +273,7 @@ static size_t __init i845_tseg_size(void)
return 0;
}
-static size_t __init i85x_tseg_size(void)
+static resource_size_t __init i85x_tseg_size(void)
{
u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
@@ -283,12 +283,12 @@ static size_t __init i85x_tseg_size(void)
return MB(1);
}
-static size_t __init i830_mem_size(void)
+static resource_size_t __init i830_mem_size(void)
{
return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32);
}
-static size_t __init i85x_mem_size(void)
+static resource_size_t __init i85x_mem_size(void)
{
return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32);
}
@@ -297,36 +297,36 @@ static size_t __init i85x_mem_size(void)
* On 830/845/85x the stolen memory base isn't available in any
* register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
*/
-static phys_addr_t __init i830_stolen_base(int num, int slot, int func,
- size_t stolen_size)
+static resource_size_t __init i830_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
{
- return (phys_addr_t)i830_mem_size() - i830_tseg_size() - stolen_size;
+ return i830_mem_size() - i830_tseg_size() - stolen_size;
}
-static phys_addr_t __init i845_stolen_base(int num, int slot, int func,
- size_t stolen_size)
+static resource_size_t __init i845_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
{
- return (phys_addr_t)i830_mem_size() - i845_tseg_size() - stolen_size;
+ return i830_mem_size() - i845_tseg_size() - stolen_size;
}
-static phys_addr_t __init i85x_stolen_base(int num, int slot, int func,
- size_t stolen_size)
+static resource_size_t __init i85x_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
{
- return (phys_addr_t)i85x_mem_size() - i85x_tseg_size() - stolen_size;
+ return i85x_mem_size() - i85x_tseg_size() - stolen_size;
}
-static phys_addr_t __init i865_stolen_base(int num, int slot, int func,
- size_t stolen_size)
+static resource_size_t __init i865_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
{
u16 toud = 0;
toud = read_pci_config_16(0, 0, 0, I865_TOUD);
- return (phys_addr_t)(toud << 16) + i845_tseg_size();
+ return toud * KB(64) + i845_tseg_size();
}
-static phys_addr_t __init gen3_stolen_base(int num, int slot, int func,
- size_t stolen_size)
+static resource_size_t __init gen3_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
{
u32 bsm;
@@ -337,10 +337,10 @@ static phys_addr_t __init gen3_stolen_base(int num, int slot, int func,
*/
bsm = read_pci_config(num, slot, func, INTEL_BSM);
- return (phys_addr_t)bsm & INTEL_BSM_MASK;
+ return bsm & INTEL_BSM_MASK;
}
-static size_t __init i830_stolen_size(int num, int slot, int func)
+static resource_size_t __init i830_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
u16 gms;
@@ -361,7 +361,7 @@ static size_t __init i830_stolen_size(int num, int slot, int func)
return 0;
}
-static size_t __init gen3_stolen_size(int num, int slot, int func)
+static resource_size_t __init gen3_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
u16 gms;
@@ -390,7 +390,7 @@ static size_t __init gen3_stolen_size(int num, int slot, int func)
return 0;
}
-static size_t __init gen6_stolen_size(int num, int slot, int func)
+static resource_size_t __init gen6_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
u16 gms;
@@ -398,10 +398,10 @@ static size_t __init gen6_stolen_size(int num, int slot, int func)
gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
- return (size_t)gms * MB(32);
+ return gms * MB(32);
}
-static size_t __init gen8_stolen_size(int num, int slot, int func)
+static resource_size_t __init gen8_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
u16 gms;
@@ -409,10 +409,10 @@ static size_t __init gen8_stolen_size(int num, int slot, int func)
gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
- return (size_t)gms * MB(32);
+ return gms * MB(32);
}
-static size_t __init chv_stolen_size(int num, int slot, int func)
+static resource_size_t __init chv_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
u16 gms;
@@ -426,14 +426,14 @@ static size_t __init chv_stolen_size(int num, int slot, int func)
* 0x17 to 0x1d: 4MB increments start at 36MB
*/
if (gms < 0x11)
- return (size_t)gms * MB(32);
+ return gms * MB(32);
else if (gms < 0x17)
- return (size_t)(gms - 0x11 + 2) * MB(4);
+ return (gms - 0x11) * MB(4) + MB(8);
else
- return (size_t)(gms - 0x17 + 9) * MB(4);
+ return (gms - 0x17) * MB(4) + MB(36);
}
-static size_t __init gen9_stolen_size(int num, int slot, int func)
+static resource_size_t __init gen9_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
u16 gms;
@@ -444,14 +444,15 @@ static size_t __init gen9_stolen_size(int num, int slot, int func)
/* 0x0 to 0xef: 32MB increments starting at 0MB */
/* 0xf0 to 0xfe: 4MB increments starting at 4MB */
if (gms < 0xf0)
- return (size_t)gms * MB(32);
+ return gms * MB(32);
else
- return (size_t)(gms - 0xf0 + 1) * MB(4);
+ return (gms - 0xf0) * MB(4) + MB(4);
}
struct intel_early_ops {
- size_t (*stolen_size)(int num, int slot, int func);
- phys_addr_t (*stolen_base)(int num, int slot, int func, size_t size);
+ resource_size_t (*stolen_size)(int num, int slot, int func);
+ resource_size_t (*stolen_base)(int num, int slot, int func,
+ resource_size_t size);
};
static const struct intel_early_ops i830_early_ops __initconst = {
@@ -527,16 +528,20 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_SKL_IDS(&gen9_early_ops),
INTEL_BXT_IDS(&gen9_early_ops),
INTEL_KBL_IDS(&gen9_early_ops),
+ INTEL_CFL_IDS(&gen9_early_ops),
INTEL_GLK_IDS(&gen9_early_ops),
INTEL_CNL_IDS(&gen9_early_ops),
};
+struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
+EXPORT_SYMBOL(intel_graphics_stolen_res);
+
static void __init
intel_graphics_stolen(int num, int slot, int func,
const struct intel_early_ops *early_ops)
{
- phys_addr_t base, end;
- size_t size;
+ resource_size_t base, size;
+ resource_size_t end;
size = early_ops->stolen_size(num, slot, func);
base = early_ops->stolen_base(num, slot, func, size);
@@ -545,8 +550,12 @@ intel_graphics_stolen(int num, int slot, int func,
return;
end = base + size - 1;
- printk(KERN_INFO "Reserving Intel graphics memory at %pa-%pa\n",
- &base, &end);
+
+ intel_graphics_stolen_res.start = base;
+ intel_graphics_stolen_res.end = end;
+
+ printk(KERN_INFO "Reserving Intel graphics memory at %pR\n",
+ &intel_graphics_stolen_res);
/* Mark this space as reserved */
e820__range_add(base, size, E820_TYPE_RESERVED);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c29020907886..b59e4fb40fd9 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -37,7 +37,7 @@
#define X86 new_cpu_data+CPUINFO_x86
#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
#define X86_MODEL new_cpu_data+CPUINFO_x86_model
-#define X86_MASK new_cpu_data+CPUINFO_x86_mask
+#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping
#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
@@ -332,7 +332,7 @@ ENTRY(startup_32_smp)
shrb $4,%al
movb %al,X86_MODEL
andb $0x0f,%cl # mask mask revision
- movb %cl,X86_MASK
+ movb %cl,X86_STEPPING
movl %edx,X86_CAPABILITY
.Lis486:
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 68e1867cca80..45fb4d2565f8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -142,6 +142,15 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_puts(p, " Hypervisor callback interrupts\n");
}
#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HRE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_reenlightenment_count);
+ seq_puts(p, " Hyper-V reenlightenment interrupts\n");
+ }
+#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b40ffbf156c1..4e37d1a851a6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -498,6 +498,34 @@ static void __init kvm_apf_trap_init(void)
update_intr_gate(X86_TRAP_PF, async_page_fault);
}
+static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+ * We have to call flush only on online vCPUs. And
+ * queue flush_on_enter for pre-empted vCPUs
+ */
+ for_each_cpu(cpu, flushmask) {
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_others(flushmask, info);
+}
+
static void __init kvm_guest_init(void)
{
int i;
@@ -517,6 +545,9 @@ static void __init kvm_guest_init(void)
pv_time_ops.steal_clock = kvm_steal_clock;
}
+ if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH))
+ pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
@@ -598,6 +629,22 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
+static __init int kvm_setup_pv_tlb_flush(void)
+{
+ int cpu;
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) {
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
+ return 0;
+}
+arch_initcall(kvm_setup_pv_tlb_flush);
+
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
@@ -643,7 +690,7 @@ __visible bool __kvm_vcpu_is_preempted(long cpu)
{
struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
- return !!src->preempted;
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 27d0a1712663..f1c5eb99d445 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -410,7 +410,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
processor.cpuflag = CPU_ENABLED;
processor.cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping;
processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
processor.reserved[0] = 0;
processor.reserved[1] = 0;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 041096bdef86..99dc79e76bdc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -200,9 +200,9 @@ static void native_flush_tlb_global(void)
__native_flush_tlb_global();
}
-static void native_flush_tlb_single(unsigned long addr)
+static void native_flush_tlb_one_user(unsigned long addr)
{
- __native_flush_tlb_single(addr);
+ __native_flush_tlb_one_user(addr);
}
struct static_key paravirt_steal_enabled;
@@ -401,7 +401,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.flush_tlb_user = native_flush_tlb,
.flush_tlb_kernel = native_flush_tlb_global,
- .flush_tlb_single = native_flush_tlb_single,
+ .flush_tlb_one_user = native_flush_tlb_one_user,
.flush_tlb_others = native_flush_tlb_others,
.pgd_alloc = __paravirt_pgd_alloc,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 599d7462eccc..df7ab02f959f 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
#include <linux/dma-debug.h>
#include <linux/dmar.h>
#include <linux/export.h>
@@ -87,7 +87,6 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_mask = dma_alloc_coherent_mask(dev, flag);
- flag &= ~__GFP_ZERO;
again:
page = NULL;
/* CMA can be used only in the context which permits sleeping */
@@ -139,7 +138,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
if (!*dev)
*dev = &x86_dma_fallback_dev;
- *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
*gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
if (!is_device_dma_capable(*dev))
@@ -217,7 +215,7 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-int x86_dma_supported(struct device *dev, u64 mask)
+int arch_dma_supported(struct device *dev, u64 mask)
{
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
@@ -226,12 +224,6 @@ int x86_dma_supported(struct device *dev, u64 mask)
}
#endif
- /* Copied from i386. Doesn't make much sense, because it will
- only work for pci_alloc_coherent.
- The caller just has to use GFP_DMA in this case. */
- if (mask < DMA_BIT_MASK(24))
- return 0;
-
/* Tell the device to use SAC when IOMMU force is on. This
allows the driver to use cheaper accesses in some cases.
@@ -251,6 +243,17 @@ int x86_dma_supported(struct device *dev, u64 mask)
return 1;
}
+EXPORT_SYMBOL(arch_dma_supported);
+
+int x86_dma_supported(struct device *dev, u64 mask)
+{
+ /* Copied from i386. Doesn't make much sense, because it will
+ only work for pci_alloc_coherent.
+ The caller just has to use GFP_DMA in this case. */
+ if (mask < DMA_BIT_MASK(24))
+ return 0;
+ return 1;
+}
static int __init pci_iommu_init(void)
{
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index b0caae27e1b7..618285e475c6 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Fallback functions when the main IOMMU code is not compiled in. This
code is roughly equivalent to i386. */
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
#include <linux/scatterlist.h>
#include <linux/string.h>
#include <linux/gfp.h>
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 53bd05ea90d8..0ee0f8f34251 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,7 +6,7 @@
#include <linux/init.h>
#include <linux/swiotlb.h>
#include <linux/bootmem.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
#include <linux/mem_encrypt.h>
#include <asm/iommu.h>
@@ -48,7 +48,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size,
dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
-static const struct dma_map_ops swiotlb_dma_ops = {
+static const struct dma_map_ops x86_swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc = x86_swiotlb_alloc_coherent,
.free = x86_swiotlb_free_coherent,
@@ -112,7 +112,7 @@ void __init pci_swiotlb_init(void)
{
if (swiotlb) {
swiotlb_init(0);
- dma_ops = &swiotlb_dma_ops;
+ dma_ops = &x86_swiotlb_dma_ops;
}
}
@@ -120,7 +120,7 @@ void __init pci_swiotlb_late_init(void)
{
/* An IOMMU turned us off. */
if (!swiotlb)
- swiotlb_free();
+ swiotlb_exit();
else {
printk(KERN_INFO "PCI-DMA: "
"Using software bounce buffering for IO (SWIOTLB)\n");
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c75466232016..9eb448c7859d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -557,7 +557,7 @@ static void __set_personality_x32(void)
* Pretend to come from a x32 execve.
*/
task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
- current->thread.status &= ~TS_COMPAT;
+ current_thread_info()->status &= ~TS_COMPAT;
#endif
}
@@ -571,7 +571,7 @@ static void __set_personality_ia32(void)
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
task_pt_regs(current)->orig_ax = __NR_ia32_execve;
- current->thread.status |= TS_COMPAT;
+ current_thread_info()->status |= TS_COMPAT;
#endif
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f37d18124648..ed5c4cdf0a34 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
*/
regs->orig_ax = value;
if (syscall_get_nr(child, regs) >= 0)
- child->thread.status |= TS_I386_REGS_POKED;
+ child->thread_info.status |= TS_I386_REGS_POKED;
break;
case offsetof(struct user32, regs.eflags):
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 307d3bac5f04..11eda21eb697 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -68,6 +68,9 @@ relocate_kernel:
movq %cr4, %rax
movq %rax, CR4(%r11)
+ /* Save CR4. Required to enable the right paging mode later. */
+ movq %rax, %r13
+
/* zero out flags, and disable interrupts */
pushq $0
popfq
@@ -126,8 +129,13 @@ identity_mapped:
/*
* Set cr4 to a known state:
* - physical address extension enabled
+ * - 5-level paging, if it was enabled before
*/
movl $X86_CR4_PAE, %eax
+ testq $X86_CR4_LA57, %r13
+ jz 1f
+ orl $X86_CR4_LA57, %eax
+1:
movq %rax, %cr4
jmp 1f
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b9e00e8f1c9b..4cdc0b27ec82 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* than the tracee.
*/
#ifdef CONFIG_IA32_EMULATION
- if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 8c6da1a643da..ac057f9b0763 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -25,8 +25,8 @@ static inline void signal_compat_build_tests(void)
* limits also have to look at this code. Make sure any
* new fields are handled in copy_siginfo_to_user32()!
*/
- BUILD_BUG_ON(NSIGILL != 8);
- BUILD_BUG_ON(NSIGFPE != 8);
+ BUILD_BUG_ON(NSIGILL != 11);
+ BUILD_BUG_ON(NSIGFPE != 13);
BUILD_BUG_ON(NSIGSEGV != 4);
BUILD_BUG_ON(NSIGBUS != 5);
BUILD_BUG_ON(NSIGTRAP != 4);
@@ -64,7 +64,7 @@ static inline void signal_compat_build_tests(void)
CHECK_SI_SIZE (_kill, 2*sizeof(int));
CHECK_CSI_OFFSET(_timer);
- CHECK_CSI_SIZE (_timer, 5*sizeof(int));
+ CHECK_CSI_SIZE (_timer, 3*sizeof(int));
CHECK_SI_SIZE (_timer, 6*sizeof(int));
CHECK_CSI_OFFSET(_rt);
@@ -75,9 +75,11 @@ static inline void signal_compat_build_tests(void)
CHECK_CSI_SIZE (_sigchld, 5*sizeof(int));
CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+#ifdef CONFIG_X86_X32_ABI
CHECK_CSI_OFFSET(_sigchld_x32);
CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int));
/* no _sigchld_x32 in the generic siginfo_t */
+#endif
CHECK_CSI_OFFSET(_sigfault);
CHECK_CSI_SIZE (_sigfault, 4*sizeof(int));
@@ -96,6 +98,8 @@ static inline void signal_compat_build_tests(void)
void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
{
+ signal_compat_build_tests();
+
/* Don't leak in-kernel non-uapi flags to user-space */
if (oact)
oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
@@ -111,116 +115,3 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
if (in_x32_syscall())
act->sa.sa_flags |= SA_X32_ABI;
}
-
-int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
- bool x32_ABI)
-{
- int err = 0;
-
- signal_compat_build_tests();
-
- if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
- return -EFAULT;
-
- put_user_try {
- /* If you change siginfo_t structure, please make sure that
- this code is fixed accordingly.
- It should never copy any pad contained in the structure
- to avoid security leaks, but must copy the generic
- 3 ints plus the relevant union member. */
- put_user_ex(from->si_signo, &to->si_signo);
- put_user_ex(from->si_errno, &to->si_errno);
- put_user_ex(from->si_code, &to->si_code);
-
- if (from->si_code < 0) {
- put_user_ex(from->si_pid, &to->si_pid);
- put_user_ex(from->si_uid, &to->si_uid);
- put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
- } else {
- /*
- * First 32bits of unions are always present:
- * si_pid === si_band === si_tid === si_addr(LS half)
- */
- put_user_ex(from->_sifields._pad[0],
- &to->_sifields._pad[0]);
- switch (siginfo_layout(from->si_signo, from->si_code)) {
- case SIL_FAULT:
- if (from->si_signo == SIGBUS &&
- (from->si_code == BUS_MCEERR_AR ||
- from->si_code == BUS_MCEERR_AO))
- put_user_ex(from->si_addr_lsb, &to->si_addr_lsb);
-
- if (from->si_signo == SIGSEGV) {
- if (from->si_code == SEGV_BNDERR) {
- compat_uptr_t lower = (unsigned long)from->si_lower;
- compat_uptr_t upper = (unsigned long)from->si_upper;
- put_user_ex(lower, &to->si_lower);
- put_user_ex(upper, &to->si_upper);
- }
- if (from->si_code == SEGV_PKUERR)
- put_user_ex(from->si_pkey, &to->si_pkey);
- }
- break;
- case SIL_SYS:
- put_user_ex(from->si_syscall, &to->si_syscall);
- put_user_ex(from->si_arch, &to->si_arch);
- break;
- case SIL_CHLD:
- if (!x32_ABI) {
- put_user_ex(from->si_utime, &to->si_utime);
- put_user_ex(from->si_stime, &to->si_stime);
- } else {
- put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
- put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
- }
- put_user_ex(from->si_status, &to->si_status);
- /* FALL THROUGH */
- case SIL_KILL:
- put_user_ex(from->si_uid, &to->si_uid);
- break;
- case SIL_POLL:
- put_user_ex(from->si_fd, &to->si_fd);
- break;
- case SIL_TIMER:
- put_user_ex(from->si_overrun, &to->si_overrun);
- put_user_ex(ptr_to_compat(from->si_ptr),
- &to->si_ptr);
- break;
- case SIL_RT:
- put_user_ex(from->si_uid, &to->si_uid);
- put_user_ex(from->si_int, &to->si_int);
- break;
- }
- }
- } put_user_catch(err);
-
- return err;
-}
-
-/* from syscall's path, where we know the ABI */
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
-{
- return __copy_siginfo_to_user32(to, from, in_x32_syscall());
-}
-
-int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
-{
- int err = 0;
- u32 ptr32;
-
- if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
- return -EFAULT;
-
- get_user_try {
- get_user_ex(to->si_signo, &from->si_signo);
- get_user_ex(to->si_errno, &from->si_errno);
- get_user_ex(to->si_code, &from->si_code);
-
- get_user_ex(to->si_pid, &from->si_pid);
- get_user_ex(to->si_uid, &from->si_uid);
- get_user_ex(ptr32, &from->si_ptr);
- to->si_ptr = compat_ptr(ptr32);
- } get_user_catch(err);
-
- return err;
-}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6f27facbaa9b..cfc61e1d45e2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1430,7 +1430,6 @@ static void remove_siblinginfo(int cpu)
cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
- c->phys_proc_id = 0;
c->cpu_core_id = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
recompute_smt_state();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 446c9ef8cfc3..3d9b2308e7fa 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -181,7 +181,7 @@ int fixup_bug(struct pt_regs *regs, int trapnr)
break;
case BUG_TRAP_TYPE_WARN:
- regs->ip += LEN_UD0;
+ regs->ip += LEN_UD2;
return 1;
}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 3df51c287844..92fd433c50b9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -81,6 +81,14 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.
+config KVM_AMD_SEV
+ def_bool y
+ bool "AMD Secure Encrypted Virtualization (SEV) support"
+ depends on KVM_AMD && X86_64
+ depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP
+ ---help---
+ Provides support for launching Encrypted VMs on AMD processors.
+
config KVM_MMU_AUDIT
bool "Audit KVM MMU"
depends on KVM && TRACEPOINTS
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0099e10eb045..a0c5a69bc7c4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -67,9 +67,7 @@ u64 kvm_supported_xcr0(void)
#define F(x) bit(X86_FEATURE_##x)
-/* These are scattered features in cpufeatures.h. */
-#define KVM_CPUID_BIT_AVX512_4VNNIW 2
-#define KVM_CPUID_BIT_AVX512_4FMAPS 3
+/* For scattered features from cpufeatures.h; we currently expose none */
#define KF(x) bit(KVM_CPUID_BIT_##x)
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
@@ -293,13 +291,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
{
switch (func) {
case 0:
- entry->eax = 1; /* only one leaf currently */
+ entry->eax = 7;
++*nent;
break;
case 1:
entry->ecx = F(MOVBE);
++*nent;
break;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (index == 0)
+ entry->ecx = F(RDPID);
+ ++*nent;
default:
break;
}
@@ -327,6 +330,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
+ unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -365,7 +369,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
- 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
+ F(TOPOEXT);
+
+ /* cpuid 0x80000008.ebx */
+ const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+ F(IBPB) | F(IBRS);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -387,12 +396,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(LA57) | F(PKU) |
- 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+ F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+ F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+ F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+ F(ARCH_CAPABILITIES);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -473,11 +484,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ebx |= F(TSC_ADJUST);
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
cpuid_mask(&entry->ecx, CPUID_7_ECX);
+ entry->ecx |= f_umip;
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
- entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
+ cpuid_mask(&entry->edx, CPUID_7_EDX);
} else {
entry->ebx = 0;
entry->ecx = 0;
@@ -594,7 +606,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_ASYNC_PF) |
(1 << KVM_FEATURE_PV_EOI) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
- (1 << KVM_FEATURE_PV_UNHALT);
+ (1 << KVM_FEATURE_PV_UNHALT) |
+ (1 << KVM_FEATURE_PV_TLB_FLUSH);
if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
@@ -604,7 +617,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->edx = 0;
break;
case 0x80000000:
- entry->eax = min(entry->eax, 0x8000001a);
+ entry->eax = min(entry->eax, 0x8000001f);
break;
case 0x80000001:
entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
@@ -627,7 +640,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
if (!g_phys_as)
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
- entry->ebx = entry->edx = 0;
+ entry->edx = 0;
+ /* IBRS and IBPB aren't necessarily present in hardware cpuid */
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ entry->ebx |= F(IBPB);
+ if (boot_cpu_has(X86_FEATURE_IBRS))
+ entry->ebx |= F(IBRS);
+ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
break;
}
case 0x80000019:
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c2cea6651279..9a327d5b6d1f 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
[CPUID_7_ECX] = { 7, 0, CPUID_ECX},
[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
};
static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 290ecf711aec..d91eaeb01034 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3533,6 +3533,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ return emulate_gp(ctxt, 0);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
{
u64 tsc = 0;
@@ -3652,17 +3662,27 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
{
- if (ctxt->modrm_reg > VCPU_SREG_GS)
- return emulate_ud(ctxt);
+ if (segment > VCPU_SREG_GS &&
+ (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
- ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+ ctxt->dst.val = get_segment_selector(ctxt, segment);
if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
ctxt->dst.bytes = 2;
return X86EMUL_CONTINUE;
}
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
{
u16 sel = ctxt->src.val;
@@ -3678,6 +3698,11 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
}
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
static int em_lldt(struct x86_emulate_ctxt *ctxt)
{
u16 sel = ctxt->src.val;
@@ -3687,6 +3712,11 @@ static int em_lldt(struct x86_emulate_ctxt *ctxt)
return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
}
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
static int em_ltr(struct x86_emulate_ctxt *ctxt)
{
u16 sel = ctxt->src.val;
@@ -3739,6 +3769,10 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
{
struct desc_ptr desc_ptr;
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
if (ctxt->mode == X86EMUL_MODE_PROT64)
ctxt->op_bytes = 8;
get(ctxt, &desc_ptr);
@@ -3798,6 +3832,10 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
static int em_smsw(struct x86_emulate_ctxt *ctxt)
{
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
if (ctxt->dst.type == OP_MEM)
ctxt->dst.bytes = 2;
ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
@@ -4383,8 +4421,8 @@ static const struct opcode group5[] = {
};
static const struct opcode group6[] = {
- DI(Prot | DstMem, sldt),
- DI(Prot | DstMem, str),
+ II(Prot | DstMem, em_sldt, sldt),
+ II(Prot | DstMem, em_str, str),
II(Prot | Priv | SrcMem16, em_lldt, lldt),
II(Prot | Priv | SrcMem16, em_ltr, ltr),
N, N, N, N,
@@ -4415,10 +4453,20 @@ static const struct opcode group8[] = {
F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
};
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+};
+
+
static const struct group_dual group9 = { {
N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
}, {
- N, N, N, N, N, N, N, N,
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
} };
static const struct opcode group11[] = {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 5c24811e8b0b..f171051eecf3 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -79,7 +79,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
if (kvm_cpu_has_extint(v))
return 1;
- if (kvm_vcpu_apicv_active(v))
+ if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
return 0;
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e2c1fb8d35ce..924ac8ce9d50 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -364,32 +364,41 @@ static u8 count_vectors(void *bitmap)
return count;
}
-int __kvm_apic_update_irr(u32 *pir, void *regs)
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
{
u32 i, vec;
- u32 pir_val, irr_val;
- int max_irr = -1;
+ u32 pir_val, irr_val, prev_irr_val;
+ int max_updated_irr;
+
+ max_updated_irr = -1;
+ *max_irr = -1;
for (i = vec = 0; i <= 7; i++, vec += 32) {
pir_val = READ_ONCE(pir[i]);
irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
if (pir_val) {
+ prev_irr_val = irr_val;
irr_val |= xchg(&pir[i], 0);
*((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
+ if (prev_irr_val != irr_val) {
+ max_updated_irr =
+ __fls(irr_val ^ prev_irr_val) + vec;
+ }
}
if (irr_val)
- max_irr = __fls(irr_val) + vec;
+ *max_irr = __fls(irr_val) + vec;
}
- return max_irr;
+ return ((max_updated_irr != -1) &&
+ (max_updated_irr == *max_irr));
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
-int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- return __kvm_apic_update_irr(pir, apic->regs);
+ return __kvm_apic_update_irr(pir, apic->regs, max_irr);
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
@@ -581,7 +590,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
- if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
+ if (apic->vcpu->arch.apicv_active)
highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4b9935a38347..56c36014f7b7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -75,8 +75,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode);
-int __kvm_apic_update_irr(u32 *pir, void *regs);
-int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2b8eb4da4d08..46ff304140c7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -42,6 +42,7 @@
#include <linux/kern_levels.h>
#include <asm/page.h>
+#include <asm/pat.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>
@@ -381,7 +382,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-void kvm_mmu_clear_all_pte_masks(void)
+static void kvm_mmu_clear_all_pte_masks(void)
{
shadow_user_mask = 0;
shadow_accessed_mask = 0;
@@ -2708,7 +2709,18 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
- return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+ /*
+ * Some reserved pages, such as those from NVDIMM
+ * DAX devices, are not for MMIO, and can be mapped
+ * with cached memory type for better performance.
+ * However, the above check misconceives those pages
+ * as MMIO, and results in KVM mapping them with UC
+ * memory type, which would hurt the performance.
+ * Therefore, we check the host memory type in addition
+ * and only treat UC/UC-/WC pages as MMIO.
+ */
+ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
return true;
}
@@ -4951,6 +4963,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
emulate:
+ /*
+ * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
+ * This can happen if a guest gets a page-fault on data access but the HW
+ * table walker is not able to read the instruction page (e.g instruction
+ * page is not present in memory). In those cases we simply restart the
+ * guest.
+ */
+ if (unlikely(insn && !insn_len))
+ return 1;
+
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
switch (er) {
@@ -5058,7 +5080,7 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
/* The caller should hold mmu-lock before calling this function. */
-static bool
+static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
@@ -5088,7 +5110,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
return flush;
}
-static bool
+static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
bool lock_flush_tlb)
@@ -5099,7 +5121,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
lock_flush_tlb);
}
-static bool
+static __always_inline bool
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, bool lock_flush_tlb)
{
@@ -5107,7 +5129,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}
-static bool
+static __always_inline bool
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, bool lock_flush_tlb)
{
@@ -5115,7 +5137,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}
-static bool
+static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, bool lock_flush_tlb)
{
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index d22ddbdf5e6e..1272861e77b9 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,7 +19,7 @@
#include <linux/ratelimit.h>
-char const *audit_point_name[] = {
+static char const *audit_point_name[] = {
"pre page fault",
"post page fault",
"pre pte write",
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f40d0da1f1d3..b3e488a74828 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -37,6 +37,10 @@
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
#include <linux/frame.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -184,6 +188,8 @@ struct vcpu_svm {
u64 gs_base;
} host;
+ u64 spec_ctrl;
+
u32 *msrpm;
ulong nmi_iret_rip;
@@ -212,6 +218,9 @@ struct vcpu_svm {
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
+
+ /* which host CPU was used for running this vcpu */
+ unsigned int last_cpu;
};
/*
@@ -249,6 +258,8 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_CSTAR, .always = true },
{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = false },
+ { .index = MSR_IA32_PRED_CMD, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -285,8 +296,12 @@ module_param(vls, int, 0444);
static int vgif = true;
module_param(vgif, int, 0444);
+/* enable/disable SEV support */
+static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+module_param(sev, int, 0444);
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
static void svm_complete_interrupts(struct vcpu_svm *svm);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
@@ -320,6 +335,38 @@ enum {
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+static unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long *sev_asid_bitmap;
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+static inline bool svm_sev_enabled(void)
+{
+ return max_sev_asid;
+}
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+ return sev->active;
+}
+
+static inline int sev_get_asid(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+ return sev->asid;
+}
+
static inline void mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
@@ -526,9 +573,14 @@ struct svm_cpu_data {
u64 asid_generation;
u32 max_asid;
u32 next_asid;
+ u32 min_asid;
struct kvm_ldttss_desc *tss_desc;
struct page *save_area;
+ struct vmcb *current_vmcb;
+
+ /* index = sev_asid, value = vmcb pointer */
+ struct vmcb **sev_vmcbs;
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
@@ -783,6 +835,7 @@ static int svm_hardware_enable(void)
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
+ sd->min_asid = max_sev_asid + 1;
gdt = get_current_gdt_rw();
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
@@ -841,6 +894,7 @@ static void svm_cpu_uninit(int cpu)
return;
per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+ kfree(sd->sev_vmcbs);
__free_page(sd->save_area);
kfree(sd);
}
@@ -854,11 +908,18 @@ static int svm_cpu_init(int cpu)
if (!sd)
return -ENOMEM;
sd->cpu = cpu;
- sd->save_area = alloc_page(GFP_KERNEL);
r = -ENOMEM;
+ sd->save_area = alloc_page(GFP_KERNEL);
if (!sd->save_area)
goto err_1;
+ if (svm_sev_enabled()) {
+ r = -ENOMEM;
+ sd->sev_vmcbs = kmalloc((max_sev_asid + 1) * sizeof(void *), GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ goto err_1;
+ }
+
per_cpu(svm_data, cpu) = sd;
return 0;
@@ -880,6 +941,25 @@ static bool valid_msr_intercept(u32 index)
return false;
}
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+{
+ u8 bit_write;
+ unsigned long tmp;
+ u32 offset;
+ u32 *msrpm;
+
+ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+ to_svm(vcpu)->msrpm;
+
+ offset = svm_msrpm_offset(msr);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ return !!test_bit(bit_write, &tmp);
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
@@ -1046,6 +1126,48 @@ static int avic_ga_log_notifier(u32 ga_tag)
return 0;
}
+static __init int sev_hardware_setup(void)
+{
+ struct sev_user_data_status *status;
+ int rc;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = cpuid_ecx(0x8000001F);
+
+ if (!max_sev_asid)
+ return 1;
+
+ /* Minimum ASID value that should be used for SEV guest */
+ min_sev_asid = cpuid_edx(0x8000001F);
+
+ /* Initialize SEV ASID bitmap */
+ sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ return 1;
+
+ status = kmalloc(sizeof(*status), GFP_KERNEL);
+ if (!status)
+ return 1;
+
+ /*
+ * Check SEV platform status.
+ *
+ * PLATFORM_STATUS can be called in any state, if we failed to query
+ * the PLATFORM status then either PSP firmware does not support SEV
+ * feature or SEV firmware is dead.
+ */
+ rc = sev_platform_status(status, NULL);
+ if (rc)
+ goto err;
+
+ pr_info("SEV supported\n");
+
+err:
+ kfree(status);
+ return rc;
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1081,6 +1203,17 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
+ if (sev) {
+ if (boot_cpu_has(X86_FEATURE_SEV) &&
+ IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
+ r = sev_hardware_setup();
+ if (r)
+ sev = false;
+ } else {
+ sev = false;
+ }
+ }
+
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
@@ -1142,6 +1275,9 @@ static __exit void svm_hardware_unsetup(void)
{
int cpu;
+ if (svm_sev_enabled())
+ kfree(sev_asid_bitmap);
+
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -1294,7 +1430,7 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
- control->nested_ctl = 1;
+ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1332,6 +1468,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
+ if (sev_guest(svm->vcpu.kvm)) {
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ clr_exception_intercept(svm, UD_VECTOR);
+ }
+
mark_all_dirty(svm->vmcb);
enable_gif(svm);
@@ -1414,6 +1555,179 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+static void __sev_asid_free(int asid)
+{
+ struct svm_cpu_data *sd;
+ int cpu, pos;
+
+ pos = asid - 1;
+ clear_bit(pos, sev_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu(svm_data, cpu);
+ sd->sev_vmcbs[pos] = NULL;
+ }
+}
+
+static void sev_asid_free(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+ __sev_asid_free(sev->asid);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_decommission *decommission;
+ struct sev_data_deactivate *data;
+
+ if (!handle)
+ return;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return;
+
+ /* deactivate handle */
+ data->handle = handle;
+ sev_guest_deactivate(data, NULL);
+
+ wbinvd_on_all_cpus();
+ sev_guest_df_flush(NULL);
+ kfree(data);
+
+ decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+ if (!decommission)
+ return;
+
+ /* decommission handle */
+ decommission->handle = handle;
+ sev_guest_decommission(decommission, NULL);
+
+ kfree(decommission);
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ int write)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ unsigned long npages, npinned, size;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ int first, last;
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return NULL;
+ }
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = vmalloc(size);
+ else
+ pages = kmalloc(size, GFP_KERNEL);
+
+ if (!pages)
+ return NULL;
+
+ /* Pin the user virtual address. */
+ npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ release_pages(pages, npinned);
+
+ kvfree(pages);
+ return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+ release_pages(pages, npages);
+ kvfree(pages);
+ sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (npages == 0 || pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_atomic(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_atomic(page_virtual);
+ }
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+static void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ mutex_lock(&kvm->lock);
+
+ /*
+ * if userspace was terminated before unregistering the memory regions
+ * then lets unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ }
+ }
+
+ mutex_unlock(&kvm->lock);
+
+ sev_unbind_asid(kvm, sev->handle);
+ sev_asid_free(kvm);
+}
+
static void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
@@ -1432,6 +1746,12 @@ static void avic_vm_destroy(struct kvm *kvm)
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
+static void svm_vm_destroy(struct kvm *kvm)
+{
+ avic_vm_destroy(kvm);
+ sev_vm_destroy(kvm);
+}
+
static int avic_vm_init(struct kvm *kvm)
{
unsigned long flags;
@@ -1582,6 +1902,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
u32 dummy;
u32 eax = 1;
+ svm->spec_ctrl = 0;
+
if (!init_event) {
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
MSR_IA32_APICBASE_ENABLE;
@@ -1703,11 +2025,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
+ /*
+ * The vmcb page can be recycled, causing a false negative in
+ * svm_vcpu_load(). So do a full IBPB now.
+ */
+ indirect_branch_prediction_barrier();
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int i;
if (unlikely(cpu != vcpu->cpu)) {
@@ -1736,6 +2064,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+ indirect_branch_prediction_barrier();
+ }
avic_vcpu_load(vcpu, cpu);
}
@@ -2030,7 +2362,7 @@ static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu);
+ svm_flush_tlb(vcpu, true);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
@@ -2089,7 +2421,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
if (sd->next_asid > sd->max_asid) {
++sd->asid_generation;
- sd->next_asid = 1;
+ sd->next_asid = sd->min_asid;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
}
@@ -2137,22 +2469,24 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
static int pf_interception(struct vcpu_svm *svm)
{
- u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
u64 error_code = svm->vmcb->control.exit_info_1;
return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
- svm->vmcb->control.insn_bytes,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
static int npf_interception(struct vcpu_svm *svm)
{
- u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
u64 error_code = svm->vmcb->control.exit_info_1;
trace_kvm_page_fault(fault_address, error_code);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
- svm->vmcb->control.insn_bytes,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
@@ -2379,7 +2713,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
svm->vmcb->control.nested_cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_NPT);
- svm_flush_tlb(vcpu);
+ svm_flush_tlb(vcpu, true);
}
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
@@ -2921,7 +3255,8 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
if (vmcb->control.asid == 0)
return false;
- if (vmcb->control.nested_ctl && !npt_enabled)
+ if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+ !npt_enabled)
return false;
return true;
@@ -2935,7 +3270,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
else
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
- if (nested_vmcb->control.nested_ctl) {
+ if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
kvm_mmu_unload(&svm->vcpu);
svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
nested_svm_init_mmu_context(&svm->vcpu);
@@ -2983,7 +3318,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
svm->nested.intercept = nested_vmcb->control.intercept;
- svm_flush_tlb(&svm->vcpu);
+ svm_flush_tlb(&svm->vcpu, true);
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -3593,6 +3928,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_VM_CR:
msr_info->data = svm->nested.vm_cr_msr;
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ return 1;
+
+ msr_info->data = svm->spec_ctrl;
+ break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
@@ -3684,6 +4026,49 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ return 1;
+
+ svm->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_vmrun_msrpm.
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
+ return 1;
+
+ if (data & ~PRED_CMD_IBPB)
+ return 1;
+
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ if (is_guest_mode(vcpu))
+ break;
+ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+ break;
case MSR_STAR:
svm->vmcb->save.star = data;
break;
@@ -4356,12 +4741,39 @@ static void reload_tss(struct kvm_vcpu *vcpu)
load_TR_desc();
}
+static void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int asid = sev_get_asid(svm->vcpu.kvm);
+
+ /* Assign the asid allocated with this SEV guest */
+ svm->vmcb->control.asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+ * 1) when different VMCB for the same ASID is to be run on the same host CPU.
+ * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->last_cpu == cpu)
+ return;
+
+ svm->last_cpu = cpu;
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
static void pre_svm_run(struct vcpu_svm *svm)
{
int cpu = raw_smp_processor_id();
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ if (sev_guest(svm->vcpu.kvm))
+ return pre_sev_run(svm, cpu);
+
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
new_asid(svm, sd);
@@ -4779,7 +5191,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
return 0;
}
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4936,6 +5348,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
local_irq_enable();
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (svm->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
asm volatile (
"push %%" _ASM_BP "; \n\t"
"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
@@ -5028,6 +5449,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
+ if (svm->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
@@ -5092,7 +5534,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
svm->vmcb->save.cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_CR);
- svm_flush_tlb(vcpu);
+ svm_flush_tlb(vcpu, true);
}
static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -5106,7 +5548,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
mark_dirty(svm->vmcb, VMCB_CR);
- svm_flush_tlb(vcpu);
+ svm_flush_tlb(vcpu, true);
}
static int is_disabled(void)
@@ -5192,6 +5634,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
entry->edx |= SVM_FEATURE_NPT;
break;
+ case 0x8000001F:
+ /* Support memory encryption cpuid if host supports it */
+ if (boot_cpu_has(X86_FEATURE_SEV))
+ cpuid(0x8000001f, &entry->eax, &entry->ebx,
+ &entry->ecx, &entry->edx);
+
}
}
@@ -5220,6 +5668,11 @@ static bool svm_xsaves_supported(void)
return false;
}
+static bool svm_umip_emulated(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -5521,6 +5974,828 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static int sev_asid_new(void)
+{
+ int pos;
+
+ /*
+ * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+ */
+ pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
+ if (pos >= max_sev_asid)
+ return -EBUSY;
+
+ set_bit(pos, sev_asid_bitmap);
+ return pos + 1;
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ int asid, ret;
+
+ ret = -EBUSY;
+ asid = sev_asid_new();
+ if (asid < 0)
+ return ret;
+
+ ret = sev_platform_init(&argp->error);
+ if (ret)
+ goto e_free;
+
+ sev->active = true;
+ sev->asid = asid;
+ INIT_LIST_HEAD(&sev->regions_list);
+
+ return 0;
+
+e_free:
+ __sev_asid_free(asid);
+ return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ struct sev_data_activate *data;
+ int asid = sev_get_asid(kvm);
+ int ret;
+
+ wbinvd_on_all_cpus();
+
+ ret = sev_guest_df_flush(error);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ /* activate ASID on the given handle */
+ data->handle = handle;
+ data->asid = asid;
+ ret = sev_guest_activate(data, error);
+ kfree(data);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+ struct fd f;
+ int ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+ fdput(f);
+ return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct sev_data_launch_start *start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ start = kzalloc(sizeof(*start), GFP_KERNEL);
+ if (!start)
+ return -ENOMEM;
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob)) {
+ ret = PTR_ERR(dh_blob);
+ goto e_free;
+ }
+
+ start->dh_cert_address = __sme_set(__pa(dh_blob));
+ start->dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start->session_address = __sme_set(__pa(session_blob));
+ start->session_len = params.session_len;
+ }
+
+ start->handle = params.handle;
+ start->policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start->handle, error);
+ if (ret)
+ goto e_free_session;
+
+ /* return handle to userspace */
+ params.handle = start->handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start->handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->handle = start->handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+e_free:
+ kfree(start);
+ return ret;
+}
+
+static int get_num_contig_pages(int idx, struct page **inpages,
+ unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ int i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data *data;
+ struct page **inpages;
+ int i, ret, pages;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+ if (!inpages) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ /*
+ * The LAUNCH_UPDATE command will perform in-place encryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data->handle = sev->handle;
+ data->len = len;
+ data->address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct sev_data_launch_measure *data;
+ struct kvm_sev_launch_measure params;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ if (params.uaddr) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+ ret = -EINVAL;
+ goto e_free;
+ }
+
+ if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+
+ ret = -ENOMEM;
+ blob = kmalloc(params.len, GFP_KERNEL);
+ if (!blob)
+ goto e_free;
+
+ data->address = __psp_pa(blob);
+ data->len = params.len;
+ }
+
+cmd:
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data->len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct sev_data_launch_finish *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+
+ kfree(data);
+ return ret;
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ if (ret)
+ goto e_free;
+
+ params.policy = data->policy;
+ params.state = data->state;
+ params.handle = data->handle;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct sev_data_dbg *data;
+ int ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ data->dst_addr = dst;
+ data->src_addr = src;
+ data->len = size;
+
+ ret = sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ data, error);
+ kfree(data);
+ return ret;
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+ * Its safe to read more than we are asked, caller should ensure that
+ * destination has enough space.
+ */
+ src_paddr = round_down(src_paddr, 16);
+ offset = src_paddr & 15;
+ sz = round_up(sz + offset, 16);
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+ /* if inputs are not 16-byte then use intermediate buffer */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
+ page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user vaddr,
+ unsigned long dst_paddr,
+ unsigned long __user dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If source buffer is not aligned then use an intermediate buffer */
+ if (!IS_ALIGNED(vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage),
+ (void __user *)(uintptr_t)vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+ * If destination buffer or length is not aligned then do read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+ * If source is kernel buffer then use memcpy() otherwise
+ * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ (void __user *)(uintptr_t)vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr, dst_vaddr_end;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ int ret, size;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ return -EFAULT;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+ dst_vaddr_end = dst_vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (!src_p)
+ return -EFAULT;
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+ if (!dst_p) {
+ sev_unpin_memory(kvm, src_p, n);
+ return -EFAULT;
+ }
+
+ /*
+ * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+ * Since user buffer may not be page aligned, calculate the
+ * offset within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, 1);
+ sev_unpin_memory(kvm, dst_p, 1);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct sev_data_launch_secret *data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+ if (!pages)
+ return -ENOMEM;
+
+ /*
+ * The secret must be copied into contiguous memory region, lets verify
+ * that userspace memory pages are contiguous before we issue command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto e_unpin_memory;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_free;
+ }
+
+ data->trans_address = __psp_pa(blob);
+ data->trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data->trans_address = __psp_pa(blob);
+ data->trans_len = params.trans_len;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+e_unpin_memory:
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!svm_sev_enabled())
+ return -ENOTTY;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int svm_register_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return -ENOMEM;
+
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+ if (!region->pages) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ mutex_lock(&kvm->lock);
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+
+static int svm_unregister_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -5537,7 +6812,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.vcpu_reset = svm_vcpu_reset,
.vm_init = avic_vm_init,
- .vm_destroy = avic_vm_destroy,
+ .vm_destroy = svm_vm_destroy,
.prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load,
@@ -5597,6 +6872,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
+ .sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
@@ -5613,6 +6889,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.invpcid_supported = svm_invpcid_supported,
.mpx_supported = svm_mpx_supported,
.xsaves_supported = svm_xsaves_supported,
+ .umip_emulated = svm_umip_emulated,
.set_supported_cpuid = svm_set_supported_cpuid,
@@ -5636,6 +6913,10 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.pre_enter_smm = svm_pre_enter_smm,
.pre_leave_smm = svm_pre_leave_smm,
.enable_smi_window = enable_smi_window,
+
+ .mem_enc_op = svm_mem_enc_op,
+ .mem_enc_reg_region = svm_register_enc_region,
+ .mem_enc_unreg_region = svm_unregister_enc_region,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a8b96dc4cd83..3dec126aa302 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -34,6 +34,7 @@
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include <linux/frame.h>
+#include <linux/nospec.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -111,6 +112,14 @@ static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+#define MSR_TYPE_RW 3
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
+#define MSR_BITMAP_MODE_LM 4
+
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
@@ -185,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO);
extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
struct vmcs {
u32 revision_id;
@@ -210,6 +218,7 @@ struct loaded_vmcs {
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
+ unsigned long *msr_bitmap;
struct list_head loaded_vmcss_on_cpu_link;
};
@@ -226,7 +235,7 @@ struct shared_msr_entry {
* stored in guest memory specified by VMPTRLD, but is opaque to the guest,
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
* More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
* underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration).
@@ -409,12 +418,11 @@ struct __packed vmcs12 {
*/
#define VMCS12_SIZE 0x1000
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
- struct list_head list;
- gpa_t vmptr;
- struct loaded_vmcs vmcs02;
-};
+/*
+ * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
+ * supported VMCS12 field encoding.
+ */
+#define VMCS12_MAX_FIELD_INDEX 0x17
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
@@ -439,16 +447,17 @@ struct nested_vmx {
* data hold by vmcs12
*/
bool sync_shadow_vmcs;
+ bool dirty_vmcs12;
- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
- struct list_head vmcs02_pool;
- int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
/*
- * Guest pages referred to in vmcs02 with host-physical pointers, so
- * we must keep them pinned while L2 runs.
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
@@ -457,8 +466,6 @@ struct nested_vmx {
bool pi_pending;
u16 posted_intr_nv;
- unsigned long *msr_bitmap;
-
struct hrtimer preemption_timer;
bool preemption_timer_expired;
@@ -581,6 +588,7 @@ struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
u8 fail;
+ u8 msr_bitmap_mode;
u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
@@ -592,6 +600,10 @@ struct vcpu_vmx {
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
#endif
+
+ u64 arch_capabilities;
+ u64 spec_ctrl;
+
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
u32 secondary_exec_control;
@@ -659,6 +671,8 @@ struct vcpu_vmx {
u32 host_pkru;
+ unsigned long host_debugctlmsr;
+
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
@@ -687,67 +701,24 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
return &(to_vmx(vcpu)->pi_desc);
}
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
-#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
-#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
- [number##_HIGH] = VMCS12_OFFSET(name)+4
+#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
+#define FIELD64(number, name) \
+ FIELD(number, name), \
+ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
-static unsigned long shadow_read_only_fields[] = {
- /*
- * We do NOT shadow fields that are modified when L0
- * traps and emulates any vmx instruction (e.g. VMPTRLD,
- * VMXON...) executed by L1.
- * For example, VM_INSTRUCTION_ERROR is read
- * by L1 if a vmx instruction fails (part of the error path).
- * Note the code assumes this logic. If for some reason
- * we start shadowing these fields then we need to
- * force a shadow sync when L0 emulates vmx instructions
- * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
- * by nested_vmx_failValid)
- */
- VM_EXIT_REASON,
- VM_EXIT_INTR_INFO,
- VM_EXIT_INSTRUCTION_LEN,
- IDT_VECTORING_INFO_FIELD,
- IDT_VECTORING_ERROR_CODE,
- VM_EXIT_INTR_ERROR_CODE,
- EXIT_QUALIFICATION,
- GUEST_LINEAR_ADDRESS,
- GUEST_PHYSICAL_ADDRESS
+static u16 shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x) x,
+#include "vmx_shadow_fields.h"
};
static int max_shadow_read_only_fields =
ARRAY_SIZE(shadow_read_only_fields);
-static unsigned long shadow_read_write_fields[] = {
- TPR_THRESHOLD,
- GUEST_RIP,
- GUEST_RSP,
- GUEST_CR0,
- GUEST_CR3,
- GUEST_CR4,
- GUEST_INTERRUPTIBILITY_INFO,
- GUEST_RFLAGS,
- GUEST_CS_SELECTOR,
- GUEST_CS_AR_BYTES,
- GUEST_CS_LIMIT,
- GUEST_CS_BASE,
- GUEST_ES_BASE,
- GUEST_BNDCFGS,
- CR0_GUEST_HOST_MASK,
- CR0_READ_SHADOW,
- CR4_READ_SHADOW,
- TSC_OFFSET,
- EXCEPTION_BITMAP,
- CPU_BASED_VM_EXEC_CONTROL,
- VM_ENTRY_EXCEPTION_ERROR_CODE,
- VM_ENTRY_INTR_INFO_FIELD,
- VM_ENTRY_INSTRUCTION_LEN,
- VM_ENTRY_EXCEPTION_ERROR_CODE,
- HOST_FS_BASE,
- HOST_GS_BASE,
- HOST_FS_SELECTOR,
- HOST_GS_SELECTOR
+static u16 shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x) x,
+#include "vmx_shadow_fields.h"
};
static int max_shadow_read_write_fields =
ARRAY_SIZE(shadow_read_write_fields);
@@ -898,21 +869,22 @@ static const unsigned short vmcs_field_to_offset_table[] = {
static inline short vmcs_field_to_offset(unsigned long field)
{
- BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+ const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
+ unsigned short offset;
+ unsigned index;
- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+ if (field >> 15)
return -ENOENT;
- /*
- * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
- * generic mechanism.
- */
- asm("lfence");
-
- if (vmcs_field_to_offset_table[field] == 0)
+ index = ROL16(field, 6);
+ if (index >= size)
return -ENOENT;
- return vmcs_field_to_offset_table[field];
+ index = array_index_nospec(index, size);
+ offset = vmcs_field_to_offset_table[index];
+ if (offset == 0)
+ return -ENOENT;
+ return offset;
}
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
@@ -935,6 +907,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -952,14 +927,6 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
enum {
- VMX_IO_BITMAP_A,
- VMX_IO_BITMAP_B,
- VMX_MSR_BITMAP_LEGACY,
- VMX_MSR_BITMAP_LONGMODE,
- VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
- VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
- VMX_MSR_BITMAP_LEGACY_X2APIC,
- VMX_MSR_BITMAP_LONGMODE_X2APIC,
VMX_VMREAD_BITMAP,
VMX_VMWRITE_BITMAP,
VMX_BITMAP_NR
@@ -967,14 +934,6 @@ enum {
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
-#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
-#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
-#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
-#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
-#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
-#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
-#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
-#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
@@ -1918,6 +1877,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(EXCEPTION_BITMAP, eb);
}
+/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
+/*
+ * Check if MSR is intercepted for L01 MSR bitmap.
+ */
+static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
@@ -2296,6 +2301,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
+ indirect_branch_prediction_barrier();
}
if (!already_loaded) {
@@ -2333,6 +2339,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmx_vcpu_pi_load(vcpu, cpu);
vmx->host_pkru = read_pkru();
+ vmx->host_debugctlmsr = get_debugctlmsr();
}
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2572,36 +2579,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
vmx->guest_msrs[from] = tmp;
}
-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
-{
- unsigned long *msr_bitmap;
-
- if (is_guest_mode(vcpu))
- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
- else if (cpu_has_secondary_exec_ctrls() &&
- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
- else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
- } else {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
- else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
- }
- } else {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode;
- else
- msr_bitmap = vmx_msr_bitmap_legacy;
- }
-
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-}
-
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -2642,7 +2619,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->save_nmsrs = save_nmsrs;
if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(&vmx->vcpu);
+ vmx_update_msr_bitmap(&vmx->vcpu);
}
/*
@@ -2920,7 +2897,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
/* highest index: VMX_PREEMPTION_TIMER_VALUE */
- vmx->nested.nested_vmx_vmcs_enum = 0x2e;
+ vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}
/*
@@ -3256,6 +3233,7 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
*/
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
switch (msr_info->index) {
@@ -3267,8 +3245,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
- vmx_load_host_state(to_vmx(vcpu));
- msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ vmx_load_host_state(vmx);
+ msr_info->data = vmx->msr_guest_kernel_gs_base;
break;
#endif
case MSR_EFER:
@@ -3276,6 +3254,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->arch_capabilities;
+ break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3294,13 +3286,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
- !(to_vmx(vcpu)->msr_ia32_feature_control &
+ !(vmx->msr_ia32_feature_control &
FEATURE_CONTROL_LMCE))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
case MSR_IA32_FEATURE_CONTROL:
- msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
+ msr_info->data = vmx->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
@@ -3317,7 +3309,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
/* Otherwise falls through */
default:
- msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
+ msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
@@ -3383,6 +3375,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ return 1;
+
+ vmx->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging. We update the vmcs01 here for L1 as well
+ * since it will end up touching the MSR anyway now.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_RW);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ if (data & ~PRED_CMD_IBPB)
+ return 1;
+
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
+ MSR_TYPE_W);
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vmx->arch_capabilities = data;
+ break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -3639,7 +3695,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
#endif
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_INVLPG_EXITING |
@@ -3669,6 +3725,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+ SECONDARY_EXEC_DESC |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -3837,11 +3894,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
return vmcs;
}
-static struct vmcs *alloc_vmcs(void)
-{
- return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
static void free_vmcs(struct vmcs *vmcs)
{
free_pages((unsigned long)vmcs, vmcs_config.order);
@@ -3857,9 +3909,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs_clear(loaded_vmcs);
free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL;
+ if (loaded_vmcs->msr_bitmap)
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}
+static struct vmcs *alloc_vmcs(void)
+{
+ return alloc_vmcs_cpu(raw_smp_processor_id());
+}
+
+static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ loaded_vmcs->vmcs = alloc_vmcs();
+ if (!loaded_vmcs->vmcs)
+ return -ENOMEM;
+
+ loaded_vmcs->shadow_vmcs = NULL;
+ loaded_vmcs_init(loaded_vmcs);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!loaded_vmcs->msr_bitmap)
+ goto out_vmcs;
+ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+ }
+ return 0;
+
+out_vmcs:
+ free_loaded_vmcs(loaded_vmcs);
+ return -ENOMEM;
+}
+
static void free_kvm_area(void)
{
int cpu;
@@ -3870,17 +3951,17 @@ static void free_kvm_area(void)
}
}
-enum vmcs_field_type {
- VMCS_FIELD_TYPE_U16 = 0,
- VMCS_FIELD_TYPE_U64 = 1,
- VMCS_FIELD_TYPE_U32 = 2,
- VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+enum vmcs_field_width {
+ VMCS_FIELD_WIDTH_U16 = 0,
+ VMCS_FIELD_WIDTH_U64 = 1,
+ VMCS_FIELD_WIDTH_U32 = 2,
+ VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
};
-static inline int vmcs_field_type(unsigned long field)
+static inline int vmcs_field_width(unsigned long field)
{
if (0x1 & field) /* the *_HIGH fields are all 32 bit */
- return VMCS_FIELD_TYPE_U32;
+ return VMCS_FIELD_WIDTH_U32;
return (field >> 13) & 0x3 ;
}
@@ -3893,43 +3974,66 @@ static void init_vmcs_shadow_fields(void)
{
int i, j;
- /* No checks for read only fields yet */
+ for (i = j = 0; i < max_shadow_read_only_fields; i++) {
+ u16 field = shadow_read_only_fields[i];
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_only_fields ||
+ shadow_read_only_fields[i + 1] != field + 1))
+ pr_err("Missing field from shadow_read_only_field %x\n",
+ field + 1);
+
+ clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+ if (field & 1)
+ continue;
+#endif
+ if (j < i)
+ shadow_read_only_fields[j] = field;
+ j++;
+ }
+ max_shadow_read_only_fields = j;
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
- switch (shadow_read_write_fields[i]) {
- case GUEST_BNDCFGS:
- if (!kvm_mpx_supported())
+ u16 field = shadow_read_write_fields[i];
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_write_fields ||
+ shadow_read_write_fields[i + 1] != field + 1))
+ pr_err("Missing field from shadow_read_write_field %x\n",
+ field + 1);
+
+ /*
+ * PML and the preemption timer can be emulated, but the
+ * processor cannot vmwrite to fields that don't exist
+ * on bare metal.
+ */
+ switch (field) {
+ case GUEST_PML_INDEX:
+ if (!cpu_has_vmx_pml())
+ continue;
+ break;
+ case VMX_PREEMPTION_TIMER_VALUE:
+ if (!cpu_has_vmx_preemption_timer())
+ continue;
+ break;
+ case GUEST_INTR_STATUS:
+ if (!cpu_has_vmx_apicv())
continue;
break;
default:
break;
}
+ clear_bit(field, vmx_vmwrite_bitmap);
+ clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+ if (field & 1)
+ continue;
+#endif
if (j < i)
- shadow_read_write_fields[j] =
- shadow_read_write_fields[i];
+ shadow_read_write_fields[j] = field;
j++;
}
max_shadow_read_write_fields = j;
-
- /* shadowed fields guest access without vmexit */
- for (i = 0; i < max_shadow_read_write_fields; i++) {
- unsigned long field = shadow_read_write_fields[i];
-
- clear_bit(field, vmx_vmwrite_bitmap);
- clear_bit(field, vmx_vmread_bitmap);
- if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
- clear_bit(field + 1, vmx_vmwrite_bitmap);
- clear_bit(field + 1, vmx_vmread_bitmap);
- }
- }
- for (i = 0; i < max_shadow_read_only_fields; i++) {
- unsigned long field = shadow_read_only_fields[i];
-
- clear_bit(field, vmx_vmread_bitmap);
- if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
- clear_bit(field + 1, vmx_vmread_bitmap);
- }
}
static __init int alloc_kvm_area(void)
@@ -4142,9 +4246,10 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
+ bool invalidate_gpa)
{
- if (enable_ept) {
+ if (enable_ept && (invalidate_gpa || !enable_vpid)) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
@@ -4153,15 +4258,15 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
}
}
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
{
- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
}
static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
{
if (enable_ept)
- vmx_flush_tlb(vcpu);
+ vmx_flush_tlb(vcpu, true);
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -4359,7 +4464,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
ept_load_pdptrs(vcpu);
}
- vmx_flush_tlb(vcpu);
+ vmx_flush_tlb(vcpu, true);
vmcs_writel(GUEST_CR3, guest_cr3);
}
@@ -4376,6 +4481,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
(to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ hw_cr4 &= ~X86_CR4_UMIP;
+ } else
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+
if (cr4 & X86_CR4_VMXE) {
/*
* To use VMXON (and later other VMX instructions), a guest
@@ -4918,10 +5031,8 @@ static void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
{
int f = sizeof(unsigned long);
@@ -4955,6 +5066,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
}
}
+static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __set_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __set_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type, bool value)
+{
+ if (value)
+ vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+ else
+ vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+}
+
/*
* If a msr is allowed by L0, we should check whether it is allowed by L1.
* The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -4965,11 +5120,6 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
{
int f = sizeof(unsigned long);
- if (!cpu_has_vmx_msr_bitmap()) {
- WARN_ON(1);
- return;
- }
-
/*
* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
* have the write-low and read-high bitmap offsets the wrong way round.
@@ -5001,30 +5151,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
{
- if (!longmode_only)
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
- msr, MSR_TYPE_R | MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
- msr, MSR_TYPE_R | MSR_TYPE_W);
+ u8 mode = 0;
+
+ if (cpu_has_secondary_exec_ctrls() &&
+ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode |= MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ }
+
+ if (is_long_mode(vcpu))
+ mode |= MSR_BITMAP_MODE_LM;
+
+ return mode;
}
-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+ u8 mode)
{
- if (apicv_active) {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
- msr, type);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
- msr, type);
- } else {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, type);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, type);
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+
+ if (mode & MSR_BITMAP_MODE_X2APIC) {
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ }
}
}
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ u8 mode = vmx_msr_bitmap_mode(vcpu);
+ u8 changed = mode ^ vmx->msr_bitmap_mode;
+
+ if (!changed)
+ return;
+
+ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
+ !(mode & MSR_BITMAP_MODE_LM));
+
+ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
+ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+
+ vmx->msr_bitmap_mode = mode;
+}
+
static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
{
return enable_apicv;
@@ -5069,7 +5259,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
vapic_page = kmap(vmx->nested.virtual_apic_page);
- __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
+ __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
+ vapic_page, &max_irr);
kunmap(vmx->nested.virtual_apic_page);
status = vmcs_read16(GUEST_INTR_STATUS);
@@ -5129,14 +5320,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
- /* the PIR and ON have been set by L1. */
- kvm_vcpu_trigger_posted_interrupt(vcpu, true);
/*
* If a posted intr is not recognized by hardware,
* we will accomplish it in the next vmentry.
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ /* the PIR and ON have been set by L1. */
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
+ kvm_vcpu_kick(vcpu);
return 0;
}
return -1;
@@ -5274,7 +5466,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
}
if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);
}
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -5315,6 +5507,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
struct kvm_vcpu *vcpu = &vmx->vcpu;
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+
if (!cpu_need_virtualize_apic_accesses(vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
@@ -5333,6 +5526,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+
+ /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
+ * in vmx_set_cr4. */
+ exec_control &= ~SECONDARY_EXEC_DESC;
+
/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
(handle_vmptrld).
We can NOT enable shadow_vmcs here because we don't have yet
@@ -5452,16 +5650,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
#endif
int i;
- /* I/O */
- vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
-
if (enable_shadow_vmcs) {
vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
}
if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
@@ -5539,6 +5733,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
@@ -5567,6 +5763,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
u64 cr0;
vmx->rmode.vm86_active = 0;
+ vmx->spec_ctrl = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(vcpu, 0);
@@ -6107,6 +6304,12 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
return kvm_set_cr4(vcpu, val);
}
+static int handle_desc(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -6563,7 +6766,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu) &&
!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
trace_kvm_fast_mmio(gpa);
- return kvm_skip_emulated_instruction(vcpu);
+ /*
+ * Doing kvm_skip_emulated_instruction() depends on undefined
+ * behavior: Intel's manual doesn't mandate
+ * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
+ * occurs and while on real hardware it was observed to be set,
+ * other hypervisors (namely Hyper-V) don't set it, we end up
+ * advancing IP with some random value. Disable fast mmio when
+ * running nested and keep it for real hardware in hope that
+ * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
+ */
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+ return kvm_skip_emulated_instruction(vcpu);
+ else
+ return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
+ NULL, 0) == EMULATE_DONE;
}
ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -6744,7 +6961,7 @@ void vmx_enable_tdp(void)
static __init int hardware_setup(void)
{
- int r = -ENOMEM, i, msr;
+ int r = -ENOMEM, i;
rdmsrl_safe(MSR_EFER, &host_efer);
@@ -6760,13 +6977,6 @@ static __init int hardware_setup(void)
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
- memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-
- memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
goto out;
@@ -6779,11 +6989,6 @@ static __init int hardware_setup(void)
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
enable_vpid = 0;
- if (!cpu_has_vmx_shadow_vmcs())
- enable_shadow_vmcs = 0;
- if (enable_shadow_vmcs)
- init_vmcs_shadow_fields();
-
if (!cpu_has_vmx_ept() ||
!cpu_has_vmx_ept_4levels() ||
!cpu_has_vmx_ept_mt_wb() ||
@@ -6835,42 +7040,8 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48;
}
- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-
- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
- vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
- vmx_msr_bitmap_longmode, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_legacy_x2apic,
- vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic,
- vmx_msr_bitmap_longmode, PAGE_SIZE);
-
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
- for (msr = 0x800; msr <= 0x8ff; msr++) {
- if (msr == 0x839 /* TMCCT */)
- continue;
- vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
- }
-
- /*
- * TPR reads and writes can be virtualized even if virtual interrupt
- * delivery is not in use.
- */
- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
-
- /* EOI */
- vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
- /* SELF-IPI */
- vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
-
if (enable_ept)
vmx_enable_tdp();
else
@@ -6903,6 +7074,11 @@ static __init int hardware_setup(void)
kvm_x86_ops->cancel_hv_timer = NULL;
}
+ if (!cpu_has_vmx_shadow_vmcs())
+ enable_shadow_vmcs = 0;
+ if (enable_shadow_vmcs)
+ init_vmcs_shadow_fields();
+
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
kvm_mce_cap_supported |= MCG_LMCE_P;
@@ -6974,94 +7150,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
}
/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmx->nested.current_vmptr) {
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
- /* Recycle the least recently used VMCS. */
- item = list_last_entry(&vmx->nested.vmcs02_pool,
- struct vmcs02_list, list);
- item->vmptr = vmx->nested.current_vmptr;
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- /* Create a new VMCS */
- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
- if (!item)
- return NULL;
- item->vmcs02.vmcs = alloc_vmcs();
- item->vmcs02.shadow_vmcs = NULL;
- if (!item->vmcs02.vmcs) {
- kfree(item);
- return NULL;
- }
- loaded_vmcs_init(&item->vmcs02);
- item->vmptr = vmx->nested.current_vmptr;
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num++;
- return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmptr) {
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- return;
- }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item, *n;
-
- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- /*
- * Something will leak if the above WARN triggers. Better than
- * a use-after-free.
- */
- if (vmx->loaded_vmcs == &item->vmcs02)
- continue;
-
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- }
-}
-
-/*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
* set the success or error code of an emulated VMX instruction, as specified
* by Vol 2B, VMX Instruction Reference, "Conventions".
@@ -7241,13 +7329,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+ int r;
- if (cpu_has_vmx_msr_bitmap()) {
- vmx->nested.msr_bitmap =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx->nested.msr_bitmap)
- goto out_msr_bitmap;
- }
+ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+ if (r < 0)
+ goto out_vmcs02;
vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
if (!vmx->nested.cached_vmcs12)
@@ -7264,9 +7350,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->vmcs01.shadow_vmcs = shadow_vmcs;
}
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7278,9 +7361,9 @@ out_shadow_vmcs:
kfree(vmx->nested.cached_vmcs12);
out_cached_vmcs12:
- free_page((unsigned long)vmx->nested.msr_bitmap);
+ free_loaded_vmcs(&vmx->nested.vmcs02);
-out_msr_bitmap:
+out_vmcs02:
return -ENOMEM;
}
@@ -7423,10 +7506,6 @@ static void free_nested(struct vcpu_vmx *vmx)
free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- if (vmx->nested.msr_bitmap) {
- free_page((unsigned long)vmx->nested.msr_bitmap);
- vmx->nested.msr_bitmap = NULL;
- }
if (enable_shadow_vmcs) {
vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -7434,7 +7513,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
- /* Unpin physical memory we referred to in current vmcs02 */
+ /* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
@@ -7450,7 +7529,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.pi_desc = NULL;
}
- nested_free_all_saved_vmcss(vmx);
+ free_loaded_vmcs(&vmx->nested.vmcs02);
}
/* Emulate the VMXOFF instruction */
@@ -7493,8 +7572,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12, launch_state),
&zero, sizeof(zero));
- nested_free_vmcs02(vmx, vmptr);
-
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -7532,17 +7609,17 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
p = ((char *)(get_vmcs12(vcpu))) + offset;
- switch (vmcs_field_type(field)) {
- case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
*ret = *((natural_width *)p);
return 0;
- case VMCS_FIELD_TYPE_U16:
+ case VMCS_FIELD_WIDTH_U16:
*ret = *((u16 *)p);
return 0;
- case VMCS_FIELD_TYPE_U32:
+ case VMCS_FIELD_WIDTH_U32:
*ret = *((u32 *)p);
return 0;
- case VMCS_FIELD_TYPE_U64:
+ case VMCS_FIELD_WIDTH_U64:
*ret = *((u64 *)p);
return 0;
default:
@@ -7559,17 +7636,17 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
if (offset < 0)
return offset;
- switch (vmcs_field_type(field)) {
- case VMCS_FIELD_TYPE_U16:
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_U16:
*(u16 *)p = field_value;
return 0;
- case VMCS_FIELD_TYPE_U32:
+ case VMCS_FIELD_WIDTH_U32:
*(u32 *)p = field_value;
return 0;
- case VMCS_FIELD_TYPE_U64:
+ case VMCS_FIELD_WIDTH_U64:
*(u64 *)p = field_value;
return 0;
- case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
*(natural_width *)p = field_value;
return 0;
default:
@@ -7585,7 +7662,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
unsigned long field;
u64 field_value;
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
- const unsigned long *fields = shadow_read_write_fields;
+ const u16 *fields = shadow_read_write_fields;
const int num_fields = max_shadow_read_write_fields;
preempt_disable();
@@ -7594,23 +7671,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
for (i = 0; i < num_fields; i++) {
field = fields[i];
- switch (vmcs_field_type(field)) {
- case VMCS_FIELD_TYPE_U16:
- field_value = vmcs_read16(field);
- break;
- case VMCS_FIELD_TYPE_U32:
- field_value = vmcs_read32(field);
- break;
- case VMCS_FIELD_TYPE_U64:
- field_value = vmcs_read64(field);
- break;
- case VMCS_FIELD_TYPE_NATURAL_WIDTH:
- field_value = vmcs_readl(field);
- break;
- default:
- WARN_ON(1);
- continue;
- }
+ field_value = __vmcs_readl(field);
vmcs12_write_any(&vmx->vcpu, field, field_value);
}
@@ -7622,7 +7683,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
- const unsigned long *fields[] = {
+ const u16 *fields[] = {
shadow_read_write_fields,
shadow_read_only_fields
};
@@ -7641,24 +7702,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
vmcs12_read_any(&vmx->vcpu, field, &field_value);
-
- switch (vmcs_field_type(field)) {
- case VMCS_FIELD_TYPE_U16:
- vmcs_write16(field, (u16)field_value);
- break;
- case VMCS_FIELD_TYPE_U32:
- vmcs_write32(field, (u32)field_value);
- break;
- case VMCS_FIELD_TYPE_U64:
- vmcs_write64(field, (u64)field_value);
- break;
- case VMCS_FIELD_TYPE_NATURAL_WIDTH:
- vmcs_writel(field, (long)field_value);
- break;
- default:
- WARN_ON(1);
- break;
- }
+ __vmcs_writel(field, field_value);
}
}
@@ -7727,8 +7771,10 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
unsigned long field;
gva_t gva;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
/* The value to write might be 32 or 64 bits, depending on L1's long
* mode, and eventually we need to write that into a field of several
* possible lengths. The code below first zero-extends the value to 64
@@ -7771,6 +7817,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
+ switch (field) {
+#define SHADOW_FIELD_RW(x) case x:
+#include "vmx_shadow_fields.h"
+ /*
+ * The fields that can be updated by L1 without a vmexit are
+ * always updated in the vmcs02, the others go down the slow
+ * path of prepare_vmcs02.
+ */
+ break;
+ default:
+ vmx->nested.dirty_vmcs12 = true;
+ break;
+ }
+
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -7785,6 +7845,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
__pa(vmx->vmcs01.shadow_vmcs));
vmx->nested.sync_shadow_vmcs = true;
}
+ vmx->nested.dirty_vmcs12 = true;
}
/* Emulate the VMPTRLD instruction */
@@ -8005,7 +8066,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02);
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
@@ -8199,6 +8260,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_GDTR_IDTR] = handle_desc,
+ [EXIT_REASON_LDTR_TR] = handle_desc,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
@@ -8406,10 +8469,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
/*
* The host physical addresses of some pages of guest memory
- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
- * may write to these pages via their host physical address while
- * L2 is running, bypassing any address-translation-based dirty
- * tracking (e.g. EPT write protection).
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
*
* Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking.
@@ -8943,7 +9007,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);
}
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@@ -9007,36 +9071,23 @@ static void vmx_set_rvi(int vector)
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
- if (!is_guest_mode(vcpu)) {
- vmx_set_rvi(max_irr);
- return;
- }
-
- if (max_irr == -1)
- return;
-
/*
- * In guest mode. If a vmexit is needed, vmx_check_nested_events
- * handles it.
+ * When running L2, updating RVI is only relevant when
+ * vmcs12 virtual-interrupt-delivery enabled.
+ * However, it can be enabled only when L1 also
+ * intercepts external-interrupts and in that case
+ * we should not update vmcs02 RVI but instead intercept
+ * interrupt. Therefore, do nothing when running L2.
*/
- if (nested_exit_on_intr(vcpu))
- return;
-
- /*
- * Else, fall back to pre-APICv interrupt injection since L2
- * is run without virtual interrupt delivery.
- */
- if (!kvm_event_needs_reinjection(vcpu) &&
- vmx_interrupt_allowed(vcpu)) {
- kvm_queue_interrupt(vcpu, max_irr, false);
- vmx_inject_irq(vcpu);
- }
+ if (!is_guest_mode(vcpu))
+ vmx_set_rvi(max_irr);
}
static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
+ bool max_irr_updated;
WARN_ON(!vcpu->arch.apicv_active);
if (pi_test_on(&vmx->pi_desc)) {
@@ -9046,7 +9097,23 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+ max_irr_updated =
+ kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+
+ /*
+ * If we are running L2 and L1 has a new pending interrupt
+ * which can be injected, we should re-evaluate
+ * what should be done with this new L1 interrupt.
+ * If L1 intercepts external-interrupts, we should
+ * exit from L2 to L1. Otherwise, interrupt should be
+ * delivered directly to L2.
+ */
+ if (is_guest_mode(vcpu) && max_irr_updated) {
+ if (nested_exit_on_intr(vcpu))
+ kvm_vcpu_exiting_guest_mode(vcpu);
+ else
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
}
@@ -9161,6 +9228,12 @@ static bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
}
+static bool vmx_umip_emulated(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_DESC;
+}
+
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -9316,7 +9389,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long debugctlmsr, cr3, cr4;
+ unsigned long cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -9369,10 +9442,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
__write_pkru(vcpu->arch.pkru);
atomic_switch_perf_msrs(vmx);
- debugctlmsr = get_debugctlmsr();
vmx_arm_hv_timer(vcpu);
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (vmx->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -9491,12 +9572,33 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+
+ if (vmx->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (debugctlmsr)
- update_debugctlmsr(debugctlmsr);
+ if (vmx->host_debugctlmsr)
+ update_debugctlmsr(vmx->host_debugctlmsr);
#ifndef CONFIG_X86_64
/*
@@ -9576,10 +9678,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int r;
- r = vcpu_load(vcpu);
- BUG_ON(r);
+ vcpu_load(vcpu);
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
free_nested(vmx);
vcpu_put(vcpu);
@@ -9604,6 +9704,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ unsigned long *msr_bitmap;
int cpu;
if (!vmx)
@@ -9636,13 +9737,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->guest_msrs)
goto free_pml;
- vmx->loaded_vmcs = &vmx->vmcs01;
- vmx->loaded_vmcs->vmcs = alloc_vmcs();
- vmx->loaded_vmcs->shadow_vmcs = NULL;
- if (!vmx->loaded_vmcs->vmcs)
+ err = alloc_loaded_vmcs(&vmx->vmcs01);
+ if (err < 0)
goto free_msrs;
- loaded_vmcs_init(vmx->loaded_vmcs);
+ msr_bitmap = vmx->vmcs01.msr_bitmap;
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ vmx->msr_bitmap_mode = 0;
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
@@ -9771,7 +9879,8 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
u32 mask =
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_DESC;
u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -9937,8 +10046,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
}
}
-static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12);
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
@@ -10027,10 +10136,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
(unsigned long)(vmcs12->posted_intr_desc_addr &
(PAGE_SIZE - 1)));
}
- if (cpu_has_vmx_msr_bitmap() &&
- nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
- nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
- ;
+ if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_USE_MSR_BITMAPS);
else
vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
CPU_BASED_USE_MSR_BITMAPS);
@@ -10099,48 +10207,90 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
*/
-static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
{
int msr;
struct page *page;
unsigned long *msr_bitmap_l1;
- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+ /*
+ * pred_cmd & spec_ctrl are trying to verify two things:
+ *
+ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+ * ensures that we do not accidentally generate an L02 MSR bitmap
+ * from the L12 MSR bitmap that is too permissive.
+ * 2. That L1 or L2s have actually used the MSR. This avoids
+ * unnecessarily merging of the bitmap if the MSR is unused. This
+ * works properly because we only update the L01 MSR bitmap lazily.
+ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
+ * updated to reflect this when L1 (or its L2s) actually write to
+ * the MSR.
+ */
+ bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+ bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+
+ /* Nothing to do if the MSR bitmap is not in use. */
+ if (!cpu_has_vmx_msr_bitmap() ||
+ !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return false;
- /* This shortcut is ok because we support only x2APIC MSRs so far. */
- if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !pred_cmd && !spec_ctrl)
return false;
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
if (is_error_page(page))
return false;
+
msr_bitmap_l1 = (unsigned long *)kmap(page);
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+ * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it
+ * just lets the processor take the value from the virtual-APIC page;
+ * take those 256 bits directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
+ }
+ } else {
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+ msr_bitmap_l0[word] = ~0;
+ msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
+ }
+ }
- memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_W);
- if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
- if (nested_cpu_has_apic_reg_virt(vmcs12))
- for (msr = 0x800; msr <= 0x8ff; msr++)
- nested_vmx_disable_intercept_for_msr(
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
+
+ if (spec_ctrl)
+ nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
- msr, MSR_TYPE_R);
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_R | MSR_TYPE_W);
+ if (pred_cmd)
nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- APIC_BASE_MSR + (APIC_TASKPRI >> 4),
- MSR_TYPE_R | MSR_TYPE_W);
-
- if (nested_cpu_has_vid(vmcs12)) {
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- APIC_BASE_MSR + (APIC_EOI >> 4),
- MSR_TYPE_W);
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
- MSR_TYPE_W);
- }
- }
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD,
+ MSR_TYPE_W);
+
kunmap(page);
kvm_release_page_clean(page);
@@ -10406,25 +10556,12 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
return 0;
}
-/*
- * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
- * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
- * guest in a way that will both be appropriate to L1's requests, and our
- * needs. In addition to modifying the active vmcs (which is vmcs02), this
- * function also has additional necessary side-effects, like setting various
- * vcpu->arch fields.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
- */
-static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- bool from_vmentry, u32 *entry_failure_code)
+static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control, vmcs12_exec_ctrl;
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
@@ -10432,7 +10569,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
- vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
@@ -10442,15 +10578,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
- vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
- vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
- vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
@@ -10460,6 +10593,125 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs12->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+ if (cpu_has_vmx_posted_intr())
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+ /*
+ * Whether page-faults are trapped is determined by a combination of
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+ * If enable_ept, L0 doesn't care about page faults and we should
+ * set all of these to L1's desires. However, if !enable_ept, L0 does
+ * care about (at least some) page faults, and because it is not easy
+ * (if at all possible?) to merge L0 and L1's desires, we simply ask
+ * to exit on each and every L2 page fault. This is done by setting
+ * MASK=MATCH=0 and (see below) EB.PF=1.
+ * Note that below we don't need special code to set EB.PF beyond the
+ * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+ * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+ * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+ /* All VMFUNCs are currently emulated through L0 vmexits. */
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ if (cpu_has_vmx_apicv()) {
+ vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+ }
+
+ /*
+ * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+ * Some constant fields are set here by vmx_set_constant_host_state().
+ * Other fields are different per CPU, and will be set later when
+ * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+ */
+ vmx_set_constant_host_state(vmx);
+
+ /*
+ * Set the MSR load/store lists to match L0's settings.
+ */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (vmx_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+ if (enable_vpid) {
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ else
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ }
+
+ /*
+ * L1 may access the L2's PDPTR, so save them to construct vmcs12
+ */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ bool from_vmentry, u32 *entry_failure_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control, vmcs12_exec_ctrl;
+
+ /*
+ * First, the fields that are shadowed. This must be kept in sync
+ * with vmx_shadow_fields.h.
+ */
+
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+ vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+
+ /*
+ * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
+ * HOST_FS_BASE, HOST_GS_BASE.
+ */
+
if (from_vmentry &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
@@ -10482,16 +10734,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
} else {
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}
- vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
- vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
- vmcs12->guest_pending_dbg_exceptions);
- vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
- vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
-
- if (nested_cpu_has_xsaves(vmcs12))
- vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
exec_control = vmcs12->pin_based_vm_exec_control;
@@ -10505,7 +10748,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (nested_cpu_has_posted_intr(vmcs12)) {
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
vmx->nested.pi_pending = false;
- vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
} else {
exec_control &= ~PIN_BASED_POSTED_INTR;
}
@@ -10516,25 +10758,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (nested_cpu_has_preemption_timer(vmcs12))
vmx_start_preemption_timer(vcpu);
- /*
- * Whether page-faults are trapped is determined by a combination of
- * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
- * If enable_ept, L0 doesn't care about page faults and we should
- * set all of these to L1's desires. However, if !enable_ept, L0 does
- * care about (at least some) page faults, and because it is not easy
- * (if at all possible?) to merge L0 and L1's desires, we simply ask
- * to exit on each and every L2 page fault. This is done by setting
- * MASK=MATCH=0 and (see below) EB.PF=1.
- * Note that below we don't need special code to set EB.PF beyond the
- * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
- * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
- * !enable_ept, EB.PF is 1, so the "or" will always be 1.
- */
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
- enable_ept ? vmcs12->page_fault_error_code_mask : 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
- enable_ept ? vmcs12->page_fault_error_code_match : 0);
-
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmx->secondary_exec_control;
@@ -10553,22 +10776,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control |= vmcs12_exec_ctrl;
}
- /* All VMFUNCs are currently emulated through L0 vmexits. */
- if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
- vmcs_write64(VM_FUNCTION_CONTROL, 0);
-
- if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
- vmcs_write64(EOI_EXIT_BITMAP0,
- vmcs12->eoi_exit_bitmap0);
- vmcs_write64(EOI_EXIT_BITMAP1,
- vmcs12->eoi_exit_bitmap1);
- vmcs_write64(EOI_EXIT_BITMAP2,
- vmcs12->eoi_exit_bitmap2);
- vmcs_write64(EOI_EXIT_BITMAP3,
- vmcs12->eoi_exit_bitmap3);
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
vmcs_write16(GUEST_INTR_STATUS,
vmcs12->guest_intr_status);
- }
/*
* Write an illegal value to APIC_ACCESS_ADDR. Later,
@@ -10581,24 +10791,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
-
- /*
- * Set host-state according to L0's settings (vmcs12 is irrelevant here)
- * Some constant fields are set here by vmx_set_constant_host_state().
- * Other fields are different per CPU, and will be set later when
- * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
- */
- vmx_set_constant_host_state(vmx);
-
- /*
- * Set the MSR load/store lists to match L0's settings.
- */
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
-
/*
* HOST_RSP is normally set correctly in vmx_vcpu_run() just before
* entry, but only if the current (host) sp changed from the value
@@ -10630,8 +10822,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
/*
- * Merging of IO bitmap not currently supported.
- * Rather, exit every time.
+ * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+ * for I/O port accesses.
*/
exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
@@ -10668,12 +10860,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
}
- set_cr4_guest_host_mask(vmx);
-
- if (from_vmentry &&
- vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
- vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
-
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vmcs_write64(TSC_OFFSET,
vcpu->arch.tsc_offset + vmcs12->tsc_offset);
@@ -10692,16 +10878,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* even if spawn a lot of nested vCPUs.
*/
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
}
} else {
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx_flush_tlb(vcpu);
+ vmx_flush_tlb(vcpu, true);
}
-
}
if (enable_pml) {
@@ -10750,6 +10933,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
vmx_set_efer(vcpu, vcpu->arch.efer);
+ if (vmx->nested.dirty_vmcs12) {
+ prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
+ vmx->nested.dirty_vmcs12 = false;
+ }
+
/* Shadow page tables on either EPT or shadow page tables. */
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
entry_failure_code))
@@ -10758,16 +10946,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
- /*
- * L1 may access the L2's PDPTR, so save them to construct vmcs12
- */
- if (enable_ept) {
- vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
- vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
- vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
- vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
- }
-
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
return 0;
@@ -10903,20 +11081,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct loaded_vmcs *vmcs02;
u32 msr_entry_idx;
u32 exit_qual;
- vmcs02 = nested_get_current_vmcs02(vmx);
- if (!vmcs02)
- return -ENOMEM;
-
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- vmx_switch_vmcs(vcpu, vmcs02);
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11128,7 +11301,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
if (block_nested_events)
return -EBUSY;
nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
- vcpu->arch.exception.pending = false;
return 0;
}
@@ -11409,11 +11581,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* L1's vpid. TODO: move to a more elaborate solution, giving
* each L2 its own vpid and exposing the vpid feature to L1.
*/
- vmx_flush_tlb(vcpu);
+ vmx_flush_tlb(vcpu, true);
}
- /* Restore posted intr vector. */
- if (nested_cpu_has_posted_intr(vmcs12))
- vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -11485,7 +11654,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
@@ -11534,10 +11703,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
- /* if no vmcs02 cache requested, remove the one we used */
- if (VMCS02_POOL_SIZE == 0)
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
@@ -11678,6 +11843,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ /*
+ * RDPID causes #UD if disabled through secondary execution controls.
+ * Because it is marked as EmulateOnUD, we need to intercept it here.
+ */
+ if (info->intercept == x86_intercept_rdtscp &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->exception.error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ /* TODO: check more intercepts... */
return X86EMUL_CONTINUE;
}
@@ -12191,6 +12371,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.handle_external_intr = vmx_handle_external_intr,
.mpx_supported = vmx_mpx_supported,
.xsaves_supported = vmx_xsaves_supported,
+ .umip_emulated = vmx_umip_emulated,
.check_nested_events = vmx_check_nested_events,
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h
new file mode 100644
index 000000000000..cd0c75f6d037
--- /dev/null
+++ b/arch/x86/kvm/vmx_shadow_fields.h
@@ -0,0 +1,77 @@
+#ifndef SHADOW_FIELD_RO
+#define SHADOW_FIELD_RO(x)
+#endif
+#ifndef SHADOW_FIELD_RW
+#define SHADOW_FIELD_RW(x)
+#endif
+
+/*
+ * We do NOT shadow fields that are modified when L0
+ * traps and emulates any vmx instruction (e.g. VMPTRLD,
+ * VMXON...) executed by L1.
+ * For example, VM_INSTRUCTION_ERROR is read
+ * by L1 if a vmx instruction fails (part of the error path).
+ * Note the code assumes this logic. If for some reason
+ * we start shadowing these fields then we need to
+ * force a shadow sync when L0 emulates vmx instructions
+ * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
+ * by nested_vmx_failValid)
+ *
+ * When adding or removing fields here, note that shadowed
+ * fields must always be synced by prepare_vmcs02, not just
+ * prepare_vmcs02_full.
+ */
+
+/*
+ * Keeping the fields ordered by size is an attempt at improving
+ * branch prediction in vmcs_read_any and vmcs_write_any.
+ */
+
+/* 16-bits */
+SHADOW_FIELD_RW(GUEST_CS_SELECTOR)
+SHADOW_FIELD_RW(GUEST_INTR_STATUS)
+SHADOW_FIELD_RW(GUEST_PML_INDEX)
+SHADOW_FIELD_RW(HOST_FS_SELECTOR)
+SHADOW_FIELD_RW(HOST_GS_SELECTOR)
+
+/* 32-bits */
+SHADOW_FIELD_RO(VM_EXIT_REASON)
+SHADOW_FIELD_RO(VM_EXIT_INTR_INFO)
+SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN)
+SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD)
+SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE)
+SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE)
+SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL)
+SHADOW_FIELD_RW(EXCEPTION_BITMAP)
+SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
+SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
+SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
+SHADOW_FIELD_RW(TPR_THRESHOLD)
+SHADOW_FIELD_RW(GUEST_CS_LIMIT)
+SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
+SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
+SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
+
+/* Natural width */
+SHADOW_FIELD_RO(EXIT_QUALIFICATION)
+SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS)
+SHADOW_FIELD_RW(GUEST_RIP)
+SHADOW_FIELD_RW(GUEST_RSP)
+SHADOW_FIELD_RW(GUEST_CR0)
+SHADOW_FIELD_RW(GUEST_CR3)
+SHADOW_FIELD_RW(GUEST_CR4)
+SHADOW_FIELD_RW(GUEST_RFLAGS)
+SHADOW_FIELD_RW(GUEST_CS_BASE)
+SHADOW_FIELD_RW(GUEST_ES_BASE)
+SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
+SHADOW_FIELD_RW(CR0_READ_SHADOW)
+SHADOW_FIELD_RW(CR4_READ_SHADOW)
+SHADOW_FIELD_RW(HOST_FS_BASE)
+SHADOW_FIELD_RW(HOST_GS_BASE)
+
+/* 64-bit */
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH)
+
+#undef SHADOW_FIELD_RO
+#undef SHADOW_FIELD_RW
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c53298dfbf50..c8a0b545ac20 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -67,6 +67,8 @@
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -177,7 +179,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
- { "efer_reload", VCPU_STAT(efer_reload) },
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
@@ -702,7 +703,8 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
!vcpu->guest_xcr0_loaded) {
/* kvm_set_xcr() also depends on this */
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
vcpu->guest_xcr0_loaded = 1;
}
}
@@ -794,6 +796,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ return 1;
+
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
@@ -1009,6 +1014,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
};
static unsigned num_msrs_to_save;
@@ -1036,6 +1042,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_MCG_CTL,
MSR_IA32_MCG_EXT_CTL,
MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
};
@@ -1377,6 +1384,11 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
+static inline int gtod_is_based_on_tsc(int mode)
+{
+ return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
+}
+
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
@@ -1396,7 +1408,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
* perform request to enable masterclock.
*/
if (ka->use_master_clock ||
- (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
+ (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
@@ -1459,6 +1471,19 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
vcpu->arch.tsc_offset = offset;
}
+static inline bool kvm_check_tsc_unstable(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * TSC is marked unstable when we're running on Hyper-V,
+ * 'TSC page' clocksource is good.
+ */
+ if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
+ return false;
+#endif
+ return check_tsc_unstable();
+}
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1504,7 +1529,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
*/
if (synchronizing &&
vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
- if (!check_tsc_unstable()) {
+ if (!kvm_check_tsc_unstable()) {
offset = kvm->arch.cur_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
@@ -1604,18 +1629,43 @@ static u64 read_tsc(void)
return last;
}
-static inline u64 vgettsc(u64 *cycle_now)
+static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
{
long v;
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ u64 tsc_pg_val;
+
+ switch (gtod->clock.vclock_mode) {
+ case VCLOCK_HVCLOCK:
+ tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
+ tsc_timestamp);
+ if (tsc_pg_val != U64_MAX) {
+ /* TSC page valid */
+ *mode = VCLOCK_HVCLOCK;
+ v = (tsc_pg_val - gtod->clock.cycle_last) &
+ gtod->clock.mask;
+ } else {
+ /* TSC page invalid */
+ *mode = VCLOCK_NONE;
+ }
+ break;
+ case VCLOCK_TSC:
+ *mode = VCLOCK_TSC;
+ *tsc_timestamp = read_tsc();
+ v = (*tsc_timestamp - gtod->clock.cycle_last) &
+ gtod->clock.mask;
+ break;
+ default:
+ *mode = VCLOCK_NONE;
+ }
- *cycle_now = read_tsc();
+ if (*mode == VCLOCK_NONE)
+ *tsc_timestamp = v = 0;
- v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
return v * gtod->clock.mult;
}
-static int do_monotonic_boot(s64 *t, u64 *cycle_now)
+static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -1624,9 +1674,8 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now)
do {
seq = read_seqcount_begin(&gtod->seq);
- mode = gtod->clock.vclock_mode;
ns = gtod->nsec_base;
- ns += vgettsc(cycle_now);
+ ns += vgettsc(tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
ns += gtod->boot_ns;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -1635,7 +1684,7 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now)
return mode;
}
-static int do_realtime(struct timespec *ts, u64 *cycle_now)
+static int do_realtime(struct timespec *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -1644,10 +1693,9 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now)
do {
seq = read_seqcount_begin(&gtod->seq);
- mode = gtod->clock.vclock_mode;
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->nsec_base;
- ns += vgettsc(cycle_now);
+ ns += vgettsc(tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -1657,25 +1705,26 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now)
return mode;
}
-/* returns true if host is using tsc clocksource */
-static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
/* checked again under seqlock below */
- if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
+ return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+ tsc_timestamp));
}
-/* returns true if host is using tsc clocksource */
+/* returns true if host is using TSC based clocksource */
static bool kvm_get_walltime_and_clockread(struct timespec *ts,
- u64 *cycle_now)
+ u64 *tsc_timestamp)
{
/* checked again under seqlock below */
- if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+ return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
}
#endif
@@ -2118,6 +2167,12 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
vcpu->arch.pv_time_enabled = false;
}
+static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
+}
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
@@ -2127,7 +2182,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
- vcpu->arch.st.steal.preempted = 0;
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+ */
+ if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb(vcpu, false);
if (vcpu->arch.st.steal.version & 1)
vcpu->arch.st.steal.version += 1; /* first time write, random junk */
@@ -2228,6 +2288,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.smbase = data;
break;
+ case MSR_SMI_COUNT:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smi_count = data;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
@@ -2502,6 +2567,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vcpu->arch.smbase;
break;
+ case MSR_SMI_COUNT:
+ msr_info->data = vcpu->arch.smi_count;
+ break;
case MSR_IA32_PERF_STATUS:
/* TSC increment by tick */
msr_info->data = 1000ULL;
@@ -2869,13 +2937,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
- if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+ if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
- if (check_tsc_unstable()) {
+ if (kvm_check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
@@ -2904,7 +2972,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- vcpu->arch.st.steal.preempted = 1;
+ vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal.preempted,
@@ -2938,12 +3006,18 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
pagefault_enable();
kvm_x86_ops->vcpu_put(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
+ /*
+ * If userspace has set any breakpoints or watchpoints, dr6 is restored
+ * on every vmexit, but if not, we might have a stale dr6 from the
+ * guest. do_debug expects dr6 to be cleared after it runs, do the same.
+ */
+ set_debugreg(0, 6);
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
+ if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
return kvm_apic_get_state(vcpu, s);
@@ -3472,6 +3546,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void *buffer;
} u;
+ vcpu_load(vcpu);
+
u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
@@ -3497,8 +3573,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
- if (IS_ERR(u.lapic))
- return PTR_ERR(u.lapic);
+ if (IS_ERR(u.lapic)) {
+ r = PTR_ERR(u.lapic);
+ goto out_nofree;
+ }
r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
break;
@@ -3672,8 +3750,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_XSAVE: {
u.xsave = memdup_user(argp, sizeof(*u.xsave));
- if (IS_ERR(u.xsave))
- return PTR_ERR(u.xsave);
+ if (IS_ERR(u.xsave)) {
+ r = PTR_ERR(u.xsave);
+ goto out_nofree;
+ }
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
@@ -3695,8 +3775,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_XCRS: {
u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
- if (IS_ERR(u.xcrs))
- return PTR_ERR(u.xcrs);
+ if (IS_ERR(u.xcrs)) {
+ r = PTR_ERR(u.xcrs);
+ goto out_nofree;
+ }
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
@@ -3740,6 +3822,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
out:
kfree(u.buffer);
+out_nofree:
+ vcpu_put(vcpu);
return r;
}
@@ -4237,13 +4321,14 @@ set_identity_unlock:
mutex_unlock(&kvm->lock);
break;
case KVM_XEN_HVM_CONFIG: {
+ struct kvm_xen_hvm_config xhc;
r = -EFAULT;
- if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
- sizeof(struct kvm_xen_hvm_config)))
+ if (copy_from_user(&xhc, argp, sizeof(xhc)))
goto out;
r = -EINVAL;
- if (kvm->arch.xen_hvm_config.flags)
+ if (xhc.flags)
goto out;
+ memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
r = 0;
break;
}
@@ -4295,6 +4380,36 @@ set_identity_unlock:
r = kvm_vm_ioctl_enable_cap(kvm, &cap);
break;
}
+ case KVM_MEMORY_ENCRYPT_OP: {
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_op)
+ r = kvm_x86_ops->mem_enc_op(kvm, argp);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_REG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_reg_region)
+ r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_unreg_region)
+ r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -5703,7 +5818,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
* handle watchpoints yet, those would be handled in
* the emulate_ops.
*/
- if (kvm_vcpu_check_breakpoint(vcpu, &r))
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_breakpoint(vcpu, &r))
return r;
ctxt->interruptibility = 0;
@@ -5889,6 +6005,43 @@ static void tsc_khz_changed(void *data)
__this_cpu_write(cpu_tsc_khz, khz);
}
+#ifdef CONFIG_X86_64
+static void kvm_hyperv_tsc_notifier(void)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int cpu;
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_make_mclock_inprogress_request(kvm);
+
+ hyperv_stop_tsc_emulation();
+
+ /* TSC frequency always matches when on Hyper-V */
+ for_each_present_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ kvm_max_guest_tsc_khz = tsc_khz;
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(cpu, vcpu, kvm)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ kvm_for_each_vcpu(cpu, vcpu, kvm)
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+ }
+ spin_unlock(&kvm_lock);
+}
+#endif
+
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -6110,9 +6263,9 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
update_pvclock_gtod(tk);
/* disable master clock if host does not trust, or does not
- * use, TSC clocksource
+ * use, TSC based clocksource.
*/
- if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
queue_work(system_long_wq, &pvclock_gtod_work);
@@ -6174,6 +6327,9 @@ int kvm_arch_init(void *opaque)
kvm_lapic_init();
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif
return 0;
@@ -6186,6 +6342,10 @@ out:
void kvm_arch_exit(void)
{
+#ifdef CONFIG_X86_64
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ clear_hv_tscchange_cb();
+#endif
kvm_lapic_exit();
perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
@@ -6448,6 +6608,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->queue_exception(vcpu);
} else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
+ ++vcpu->arch.smi_count;
enter_smm(vcpu);
} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
--vcpu->arch.nmi_pending;
@@ -6749,7 +6910,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
+ if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -6758,12 +6919,6 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
-}
-
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end)
{
@@ -6832,7 +6987,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
- kvm_vcpu_flush_tlb(vcpu);
+ kvm_vcpu_flush_tlb(vcpu, true);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
@@ -6981,10 +7136,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* This handles the case where a posted interrupt was
* notified with kvm_vcpu_kick.
*/
- if (kvm_lapic_enabled(vcpu)) {
- if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(vcpu);
- }
+ if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
|| need_resched() || signal_pending(current)) {
@@ -7005,7 +7158,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
- wait_lapic_expire(vcpu);
+ if (lapic_timer_advance_ns)
+ wait_lapic_expire(vcpu);
guest_enter_irqoff();
if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -7266,8 +7420,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
+ vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
-
kvm_load_guest_fpu(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
@@ -7314,11 +7468,14 @@ out:
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
+ vcpu_put(vcpu);
return r;
}
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu_load(vcpu);
+
if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
/*
* We are here if userspace calls get_regs() in the middle of
@@ -7352,11 +7509,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu_load(vcpu);
+
vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
@@ -7386,6 +7546,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);
return 0;
}
@@ -7404,6 +7565,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
{
struct desc_ptr dt;
+ vcpu_load(vcpu);
+
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -7435,12 +7598,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ vcpu_load(vcpu);
+
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
vcpu->arch.pv.pv_unhalted)
@@ -7448,21 +7614,26 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
else
mp_state->mp_state = vcpu->arch.mp_state;
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ int ret = -EINVAL;
+
+ vcpu_load(vcpu);
+
if (!lapic_in_kernel(vcpu) &&
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
- return -EINVAL;
+ goto out;
/* INITs are latched while in SMM */
if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
- return -EINVAL;
+ goto out;
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
@@ -7470,7 +7641,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
} else
vcpu->arch.mp_state = mp_state->mp_state;
kvm_make_request(KVM_REQ_EVENT, vcpu);
- return 0;
+
+ ret = 0;
+out:
+ vcpu_put(vcpu);
+ return ret;
}
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
@@ -7524,18 +7699,21 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int mmu_reset_needed = 0;
int pending_vec, max_bits, idx;
struct desc_ptr dt;
+ int ret = -EINVAL;
+
+ vcpu_load(vcpu);
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
(sregs->cr4 & X86_CR4_OSXSAVE))
- return -EINVAL;
+ goto out;
if (kvm_valid_sregs(vcpu, sregs))
- return -EINVAL;
+ goto out;
apic_base_msr.data = sregs->apic_base;
apic_base_msr.host_initiated = true;
if (kvm_set_apic_base(vcpu, &apic_base_msr))
- return -EINVAL;
+ goto out;
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
@@ -7601,7 +7779,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_EVENT, vcpu);
- return 0;
+ ret = 0;
+out:
+ vcpu_put(vcpu);
+ return ret;
}
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -7610,6 +7791,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
+ vcpu_load(vcpu);
+
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
@@ -7655,7 +7838,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;
out:
-
+ vcpu_put(vcpu);
return r;
}
@@ -7669,6 +7852,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
int idx;
+ vcpu_load(vcpu);
+
idx = srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -7677,14 +7862,17 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
tr->writeable = 1;
tr->usermode = 0;
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxregs_state *fxsave =
- &vcpu->arch.guest_fpu.state.fxsave;
+ struct fxregs_state *fxsave;
+
+ vcpu_load(vcpu);
+ fxsave = &vcpu->arch.guest_fpu.state.fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
fpu->fsw = fxsave->swd;
@@ -7694,13 +7882,17 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fpu->last_dp = fxsave->rdp;
memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxregs_state *fxsave =
- &vcpu->arch.guest_fpu.state.fxsave;
+ struct fxregs_state *fxsave;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.state.fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -7711,6 +7903,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fxsave->rdp = fpu->last_dp;
memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+ vcpu_put(vcpu);
return 0;
}
@@ -7767,7 +7960,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
{
struct kvm_vcpu *vcpu;
- if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
printk_once(KERN_WARNING
"kvm: SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
@@ -7779,16 +7972,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
- int r;
-
kvm_vcpu_mtrr_init(vcpu);
- r = vcpu_load(vcpu);
- if (r)
- return r;
+ vcpu_load(vcpu);
kvm_vcpu_reset(vcpu, false);
kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
- return r;
+ return 0;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -7798,13 +7987,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
kvm_hv_vcpu_postcreate(vcpu);
- if (vcpu_load(vcpu))
+ if (mutex_lock_killable(&vcpu->mutex))
return;
+ vcpu_load(vcpu);
msr.data = 0x0;
msr.index = MSR_IA32_TSC;
msr.host_initiated = true;
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ mutex_unlock(&vcpu->mutex);
if (!kvmclock_periodic_sync)
return;
@@ -7815,11 +8006,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- int r;
vcpu->arch.apf.msr_val = 0;
- r = vcpu_load(vcpu);
- BUG_ON(r);
+ vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
@@ -7831,6 +8020,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.hflags = 0;
vcpu->arch.smi_pending = 0;
+ vcpu->arch.smi_count = 0;
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
@@ -7924,7 +8114,7 @@ int kvm_arch_hardware_enable(void)
return ret;
local_tsc = rdtsc();
- stable = !check_tsc_unstable();
+ stable = !kvm_check_tsc_unstable();
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!stable && vcpu->cpu == smp_processor_id())
@@ -8190,9 +8380,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
- int r;
- r = vcpu_load(vcpu);
- BUG_ON(r);
+ vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d0b95b7a90b4..b91215d1fd80 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -12,6 +12,7 @@
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.exception.pending = false;
vcpu->arch.exception.injected = false;
}
@@ -265,36 +266,8 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
static inline bool kvm_mwait_in_guest(void)
{
- unsigned int eax, ebx, ecx, edx;
-
- if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
- return false;
-
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- /* All AMD CPUs have a working MWAIT implementation */
- return true;
- case X86_VENDOR_INTEL:
- /* Handle Intel below */
- break;
- default:
- return false;
- }
-
- /*
- * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
- * they would allow guest to stop the CPU completely by disabling
- * interrupts then invoking MWAIT.
- */
- if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
- return false;
-
- cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
-
- if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
- return false;
-
- return true;
+ return boot_cpu_has(X86_FEATURE_MWAIT) &&
+ !boot_cpu_has_bug(X86_BUG_MONITOR);
}
#endif
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 69a473919260..91e9700cc6dc 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_RETPOLINE) += retpoline.o
OBJECT_FILES_NON_STANDARD_retpoline.o :=y
diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c
index d6f848d1211d..2dd1fe13a37b 100644
--- a/arch/x86/lib/cpu.c
+++ b/arch/x86/lib/cpu.c
@@ -18,7 +18,7 @@ unsigned int x86_model(unsigned int sig)
{
unsigned int fam, model;
- fam = x86_family(sig);
+ fam = x86_family(sig);
model = (sig >> 4) & 0xf;
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
new file mode 100644
index 000000000000..3cdf06128d13
--- /dev/null
+++ b/arch/x86/lib/error-inject.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+asmlinkage void just_return_func(void);
+
+asm(
+ ".type just_return_func, @function\n"
+ ".globl just_return_func\n"
+ "just_return_func:\n"
+ " ret\n"
+ ".size just_return_func, .-just_return_func\n"
+);
+
+void override_function_with_return(struct pt_regs *regs)
+{
+ regs->ip = (unsigned long)&just_return_func;
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index c97d935a29e8..49b167f73215 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -40,6 +40,8 @@ ENTRY(__get_user_1)
mov PER_CPU_VAR(current_task), %_ASM_DX
cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
ASM_STAC
1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
@@ -54,6 +56,8 @@ ENTRY(__get_user_2)
mov PER_CPU_VAR(current_task), %_ASM_DX
cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
ASM_STAC
2: movzwl -1(%_ASM_AX),%edx
xor %eax,%eax
@@ -68,6 +72,8 @@ ENTRY(__get_user_4)
mov PER_CPU_VAR(current_task), %_ASM_DX
cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
ASM_STAC
3: movl -3(%_ASM_AX),%edx
xor %eax,%eax
@@ -83,6 +89,8 @@ ENTRY(__get_user_8)
mov PER_CPU_VAR(current_task), %_ASM_DX
cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
ASM_STAC
4: movq -7(%_ASM_AX),%rdx
xor %eax,%eax
@@ -94,6 +102,8 @@ ENTRY(__get_user_8)
mov PER_CPU_VAR(current_task), %_ASM_DX
cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user_8
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
ASM_STAC
4: movl -7(%_ASM_AX),%edx
5: movl -3(%_ASM_AX),%ecx
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1b377f734e64..7add8ba06887 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -331,12 +331,12 @@ do { \
unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
{
- stac();
+ __uaccess_begin_nospec();
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
n = __copy_user_intel(to, from, n);
- clac();
+ __uaccess_end();
return n;
}
EXPORT_SYMBOL(__copy_user_ll);
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll);
unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
unsigned long n)
{
- stac();
+ __uaccess_begin_nospec();
#ifdef CONFIG_X86_INTEL_USERCOPY
if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
n = __copy_user_intel_nocache(to, from, n);
@@ -353,7 +353,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
#else
__copy_user(to, from, n);
#endif
- clac();
+ __uaccess_end();
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 135c9a7898c7..79cb066f40c0 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -829,23 +829,24 @@ void __init mem_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
+ bool want_memblock)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(nid, start_pfn, nr_pages, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size)
+int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
zone = page_zone(pfn_to_page(start_pfn));
- return __remove_pages(zone, start_pfn, nr_pages);
+ return __remove_pages(zone, start_pfn, nr_pages, altmap);
}
#endif
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4a837289f2ad..8b72923f1d35 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -256,7 +256,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
- __flush_tlb_one(vaddr);
+ __flush_tlb_one_kernel(vaddr);
}
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
@@ -772,12 +772,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
}
-int add_pages(int nid, unsigned long start_pfn,
- unsigned long nr_pages, bool want_memblock)
+int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap, bool want_memblock)
{
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
@@ -787,24 +787,24 @@ int add_pages(int nid, unsigned long start_pfn,
return ret;
}
-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
+ bool want_memblock)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
init_memory_mapping(start, start + size);
- return add_pages(nid, start_pfn, nr_pages, want_memblock);
+ return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
}
-EXPORT_SYMBOL_GPL(arch_add_memory);
#define PAGE_INUSE 0xFD
-static void __meminit free_pagetable(struct page *page, int order)
+static void __meminit free_pagetable(struct page *page, int order,
+ struct vmem_altmap *altmap)
{
unsigned long magic;
unsigned int nr_pages = 1 << order;
- struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);
if (altmap) {
vmem_altmap_free(altmap, nr_pages);
@@ -826,7 +826,8 @@ static void __meminit free_pagetable(struct page *page, int order)
free_pages((unsigned long)page_address(page), order);
}
-static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd,
+ struct vmem_altmap *altmap)
{
pte_t *pte;
int i;
@@ -838,13 +839,14 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
}
/* free a pte talbe */
- free_pagetable(pmd_page(*pmd), 0);
+ free_pagetable(pmd_page(*pmd), 0, altmap);
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
}
-static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud,
+ struct vmem_altmap *altmap)
{
pmd_t *pmd;
int i;
@@ -856,13 +858,14 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
}
/* free a pmd talbe */
- free_pagetable(pud_page(*pud), 0);
+ free_pagetable(pud_page(*pud), 0, altmap);
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
spin_unlock(&init_mm.page_table_lock);
}
-static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d,
+ struct vmem_altmap *altmap)
{
pud_t *pud;
int i;
@@ -874,7 +877,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
}
/* free a pud talbe */
- free_pagetable(p4d_page(*p4d), 0);
+ free_pagetable(p4d_page(*p4d), 0, altmap);
spin_lock(&init_mm.page_table_lock);
p4d_clear(p4d);
spin_unlock(&init_mm.page_table_lock);
@@ -882,7 +885,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
- bool direct)
+ struct vmem_altmap *altmap, bool direct)
{
unsigned long next, pages = 0;
pte_t *pte;
@@ -913,7 +916,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
* freed when offlining, or simplely not in use.
*/
if (!direct)
- free_pagetable(pte_page(*pte), 0);
+ free_pagetable(pte_page(*pte), 0, altmap);
spin_lock(&init_mm.page_table_lock);
pte_clear(&init_mm, addr, pte);
@@ -936,7 +939,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
page_addr = page_address(pte_page(*pte));
if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
- free_pagetable(pte_page(*pte), 0);
+ free_pagetable(pte_page(*pte), 0, altmap);
spin_lock(&init_mm.page_table_lock);
pte_clear(&init_mm, addr, pte);
@@ -953,7 +956,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
static void __meminit
remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
- bool direct)
+ bool direct, struct vmem_altmap *altmap)
{
unsigned long next, pages = 0;
pte_t *pte_base;
@@ -972,7 +975,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
IS_ALIGNED(next, PMD_SIZE)) {
if (!direct)
free_pagetable(pmd_page(*pmd),
- get_order(PMD_SIZE));
+ get_order(PMD_SIZE),
+ altmap);
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
@@ -986,7 +990,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
if (!memchr_inv(page_addr, PAGE_INUSE,
PMD_SIZE)) {
free_pagetable(pmd_page(*pmd),
- get_order(PMD_SIZE));
+ get_order(PMD_SIZE),
+ altmap);
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
@@ -998,8 +1003,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
}
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
- remove_pte_table(pte_base, addr, next, direct);
- free_pte_table(pte_base, pmd);
+ remove_pte_table(pte_base, addr, next, altmap, direct);
+ free_pte_table(pte_base, pmd, altmap);
}
/* Call free_pmd_table() in remove_pud_table(). */
@@ -1009,7 +1014,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
static void __meminit
remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
- bool direct)
+ struct vmem_altmap *altmap, bool direct)
{
unsigned long next, pages = 0;
pmd_t *pmd_base;
@@ -1028,7 +1033,8 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
IS_ALIGNED(next, PUD_SIZE)) {
if (!direct)
free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
+ get_order(PUD_SIZE),
+ altmap);
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
@@ -1042,7 +1048,8 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
if (!memchr_inv(page_addr, PAGE_INUSE,
PUD_SIZE)) {
free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
+ get_order(PUD_SIZE),
+ altmap);
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
@@ -1054,8 +1061,8 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
}
pmd_base = pmd_offset(pud, 0);
- remove_pmd_table(pmd_base, addr, next, direct);
- free_pmd_table(pmd_base, pud);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
+ free_pmd_table(pmd_base, pud, altmap);
}
if (direct)
@@ -1064,7 +1071,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
static void __meminit
remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
- bool direct)
+ struct vmem_altmap *altmap, bool direct)
{
unsigned long next, pages = 0;
pud_t *pud_base;
@@ -1080,14 +1087,14 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
BUILD_BUG_ON(p4d_large(*p4d));
pud_base = pud_offset(p4d, 0);
- remove_pud_table(pud_base, addr, next, direct);
+ remove_pud_table(pud_base, addr, next, altmap, direct);
/*
* For 4-level page tables we do not want to free PUDs, but in the
* 5-level case we should free them. This code will have to change
* to adapt for boot-time switching between 4 and 5 level page tables.
*/
if (CONFIG_PGTABLE_LEVELS == 5)
- free_pud_table(pud_base, p4d);
+ free_pud_table(pud_base, p4d, altmap);
}
if (direct)
@@ -1096,7 +1103,8 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
/* start and end are both virtual address. */
static void __meminit
-remove_pagetable(unsigned long start, unsigned long end, bool direct)
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next;
unsigned long addr;
@@ -1111,15 +1119,16 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
continue;
p4d = p4d_offset(pgd, 0);
- remove_p4d_table(p4d, addr, next, direct);
+ remove_p4d_table(p4d, addr, next, altmap, direct);
}
flush_tlb_all();
}
-void __ref vmemmap_free(unsigned long start, unsigned long end)
+void __ref vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
{
- remove_pagetable(start, end, false);
+ remove_pagetable(start, end, false, altmap);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -1129,24 +1138,22 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
- remove_pagetable(start, end, true);
+ remove_pagetable(start, end, true, NULL);
}
-int __ref arch_remove_memory(u64 start, u64 size)
+int __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(start_pfn);
- struct vmem_altmap *altmap;
struct zone *zone;
int ret;
/* With altmap the first mapped page is offset from @start */
- altmap = to_vmem_altmap((unsigned long) page);
if (altmap)
page += vmem_altmap_offset(altmap);
zone = page_zone(page);
- ret = __remove_pages(zone, start_pfn, nr_pages);
+ ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
WARN_ON_ONCE(ret);
kernel_physical_mapping_remove(start, start + size);
@@ -1186,8 +1193,8 @@ void __init mem_init(void)
register_page_bootmem_info();
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
- PAGE_SIZE, KCORE_OTHER);
+ if (get_gate_vma(&init_mm))
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
mem_init_print_info(NULL);
}
@@ -1378,7 +1385,10 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (pmd_none(*pmd)) {
void *p;
- p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (altmap)
+ p = altmap_alloc_block_buf(PMD_SIZE, altmap);
+ else
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
if (p) {
pte_t entry;
@@ -1411,9 +1421,9 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
return 0;
}
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
{
- struct vmem_altmap *altmap = to_vmem_altmap(start);
int err;
if (boot_cpu_has(X86_FEATURE_PSE))
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c45b6ec5357b..e2db83bebc3b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -820,5 +820,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx,
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
pte_clear(&init_mm, addr, pte);
- __flush_tlb_one(addr);
+ __flush_tlb_one_kernel(addr);
}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 58477ec3d66d..7c8686709636 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -168,7 +168,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
return -1;
}
- __flush_tlb_one(f->addr);
+ __flush_tlb_one_kernel(f->addr);
return 0;
}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index e1d61e8500f9..1a53071e2e17 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -15,7 +15,7 @@
#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/mm.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
#include <linux/swiotlb.h>
#include <linux/mem_encrypt.h>
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 4d434ddb75db..2c1ecf4763c4 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/io.h>
-#include <linux/kallsyms.h>
#include <asm/pgtable.h>
#include <linux/mmiotrace.h>
#include <asm/e820/api.h> /* for ISA_START_ADDRESS */
@@ -123,8 +122,8 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
addr, my_reason->addr);
print_pte(addr);
- print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
- print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
+ pr_emerg("faulting IP is at %pS\n", (void *)regs->ip);
+ pr_emerg("last faulting IP was at %pS\n", (void *)my_reason->ip);
#ifdef __i386__
pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index fe7d57a8fb60..1555bd7d3449 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -678,6 +678,25 @@ static enum page_cache_mode lookup_memtype(u64 paddr)
}
/**
+ * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
+ * of @pfn cannot be overridden by UC MTRR memory type.
+ *
+ * Only to be called when PAT is enabled.
+ *
+ * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
+ * Returns false in other cases.
+ */
+bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
+{
+ enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));
+
+ return cm == _PAGE_CACHE_MODE_UC ||
+ cm == _PAGE_CACHE_MODE_UC_MINUS ||
+ cm == _PAGE_CACHE_MODE_WC;
+}
+EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
+
+/**
* io_reserve_memtype - Request a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index c3c5274410a9..9bb7f0ab9fe6 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -63,7 +63,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
- __flush_tlb_one(vaddr);
+ __flush_tlb_one_kernel(vaddr);
}
unsigned long __FIXADDR_TOP = 0xfffff000;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5bfe61a5e8e3..7f1a51399674 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,13 +6,14 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
-#include <linux/debugfs.h>
/*
* TLB flushing, formerly SMP-only
@@ -228,6 +229,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#endif
this_cpu_write(cpu_tlbstate.is_lazy, false);
+ /*
+ * The membarrier system call requires a full memory barrier and
+ * core serialization before returning to user-space, after
+ * storing to rq->curr. Writing to CR3 provides that full
+ * memory barrier and core serializing instruction.
+ */
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
@@ -247,6 +254,27 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
} else {
u16 new_asid;
bool need_flush;
+ u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
+
+ /*
+ * Avoid user/user BTB poisoning by flushing the branch
+ * predictor when switching between processes. This stops
+ * one process from doing Spectre-v2 attacks on another.
+ *
+ * As an optimization, flush indirect branches only when
+ * switching into processes that disable dumping. This
+ * protects high value processes like gpg, without having
+ * too high performance overhead. IBPB is *expensive*!
+ *
+ * This will not flush branches when switching into kernel
+ * threads. It will also not flush if we switch to idle
+ * thread and back to the same process. It will flush if we
+ * switch to a different non-dumpable process.
+ */
+ if (tsk && tsk->mm &&
+ tsk->mm->context.ctx_id != last_ctx_id &&
+ get_dumpable(tsk->mm) != SUID_DUMP_USER)
+ indirect_branch_prediction_barrier();
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -292,6 +320,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
+ /*
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
+ */
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}
@@ -369,6 +405,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
+ this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
@@ -461,7 +498,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* flush that changes context.tlb_gen from 2 to 3. If they get
* processed on this CPU in reverse order, we'll see
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
- * If we were to use __flush_tlb_single() and set local_tlb_gen to
+ * If we were to use __flush_tlb_one_user() and set local_tlb_gen to
* 3, we'd be break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
@@ -482,7 +519,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
addr = f->start;
while (addr < f->end) {
- __flush_tlb_single(addr);
+ __flush_tlb_one_user(addr);
addr += PAGE_SIZE;
}
if (local)
@@ -629,7 +666,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_one(addr);
+ __flush_tlb_one_kernel(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0554e8aef4d5..4923d92f918d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -15,8 +15,6 @@
#include <asm/set_memory.h>
#include <linux/bpf.h>
-int bpf_jit_enable __read_mostly;
-
/*
* assembly code in arch/x86/net/bpf_jit.S
*/
@@ -154,6 +152,11 @@ static bool is_ereg(u32 reg)
BIT(BPF_REG_AX));
}
+static bool is_axreg(u32 reg)
+{
+ return reg == BPF_REG_0;
+}
+
/* add modifiers if 'reg' maps to x64 registers r8..r15 */
static u8 add_1mod(u8 byte, u32 reg)
{
@@ -447,16 +450,36 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
+ /* b3 holds 'normal' opcode, b2 short form only valid
+ * in case dst is eax/rax.
+ */
switch (BPF_OP(insn->code)) {
- case BPF_ADD: b3 = 0xC0; break;
- case BPF_SUB: b3 = 0xE8; break;
- case BPF_AND: b3 = 0xE0; break;
- case BPF_OR: b3 = 0xC8; break;
- case BPF_XOR: b3 = 0xF0; break;
+ case BPF_ADD:
+ b3 = 0xC0;
+ b2 = 0x05;
+ break;
+ case BPF_SUB:
+ b3 = 0xE8;
+ b2 = 0x2D;
+ break;
+ case BPF_AND:
+ b3 = 0xE0;
+ b2 = 0x25;
+ break;
+ case BPF_OR:
+ b3 = 0xC8;
+ b2 = 0x0D;
+ break;
+ case BPF_XOR:
+ b3 = 0xF0;
+ b2 = 0x35;
+ break;
}
if (is_imm8(imm32))
EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
+ else if (is_axreg(dst_reg))
+ EMIT1_off32(b2, imm32);
else
EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
break;
@@ -545,26 +568,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
*/
EMIT2(0x31, 0xd2);
- if (BPF_SRC(insn->code) == BPF_X) {
- /* if (src_reg == 0) return 0 */
-
- /* cmp r11, 0 */
- EMIT4(0x49, 0x83, 0xFB, 0x00);
-
- /* jne .+9 (skip over pop, pop, xor and jmp) */
- EMIT2(X86_JNE, 1 + 1 + 2 + 5);
- EMIT1(0x5A); /* pop rdx */
- EMIT1(0x58); /* pop rax */
- EMIT2(0x31, 0xc0); /* xor eax, eax */
-
- /* jmp cleanup_addr
- * addrs[i] - 11, because there are 11 bytes
- * after this insn: div, mov, pop, pop, mov
- */
- jmp_offset = ctx->cleanup_addr - (addrs[i] - 11);
- EMIT1_off32(0xE9, jmp_offset);
- }
-
if (BPF_CLASS(insn->code) == BPF_ALU64)
/* div r11 */
EMIT3(0x49, 0xF7, 0xF3);
@@ -1109,19 +1112,29 @@ common_load:
return proglen;
}
+struct x64_jit_data {
+ struct bpf_binary_header *header;
+ int *addrs;
+ u8 *image;
+ int proglen;
+ struct jit_context ctx;
+};
+
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
+ struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
bool tmp_blinded = false;
+ bool extra_pass = false;
u8 *image = NULL;
int *addrs;
int pass;
int i;
- if (!bpf_jit_enable)
+ if (!prog->jit_requested)
return orig_prog;
tmp = bpf_jit_blind_constants(prog);
@@ -1135,10 +1148,28 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
prog = tmp;
}
+ jit_data = prog->aux->jit_data;
+ if (!jit_data) {
+ jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+ if (!jit_data) {
+ prog = orig_prog;
+ goto out;
+ }
+ prog->aux->jit_data = jit_data;
+ }
+ addrs = jit_data->addrs;
+ if (addrs) {
+ ctx = jit_data->ctx;
+ oldproglen = jit_data->proglen;
+ image = jit_data->image;
+ header = jit_data->header;
+ extra_pass = true;
+ goto skip_init_addrs;
+ }
addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
if (!addrs) {
prog = orig_prog;
- goto out;
+ goto out_addrs;
}
/* Before first pass, make a rough estimation of addrs[]
@@ -1149,6 +1180,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
addrs[i] = proglen;
}
ctx.cleanup_addr = proglen;
+skip_init_addrs:
/* JITed image shrinks with every pass and the loop iterates
* until the image stops shrinking. Very large bpf programs
@@ -1189,7 +1221,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (image) {
bpf_flush_icache(header, image + proglen);
- bpf_jit_binary_lock_ro(header);
+ if (!prog->is_func || extra_pass) {
+ bpf_jit_binary_lock_ro(header);
+ } else {
+ jit_data->addrs = addrs;
+ jit_data->ctx = ctx;
+ jit_data->proglen = proglen;
+ jit_data->image = image;
+ jit_data->header = header;
+ }
prog->bpf_func = (void *)image;
prog->jited = 1;
prog->jited_len = proglen;
@@ -1197,8 +1237,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
prog = orig_prog;
}
+ if (!prog->is_func || extra_pass) {
out_addrs:
- kfree(addrs);
+ kfree(addrs);
+ kfree(jit_data);
+ prog->aux->jit_data = NULL;
+ }
out:
if (tmp_blinded)
bpf_jit_prog_release_other(prog, prog == orig_prog ?
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 0452629148be..52e55108404e 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -839,7 +839,8 @@ static void __init pirq_find_router(struct irq_router *r)
DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
rt->rtr_vendor, rt->rtr_device);
- pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
+ pirq_router_dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus,
+ rt->rtr_devfn);
if (!pirq_router_dev) {
DBG(KERN_DEBUG "PCI: Interrupt router not found at "
"%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 53d600217973..75577c1490c4 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -26,6 +26,7 @@
#include <linux/pci_ids.h>
#include <linux/export.h>
#include <linux/list.h>
+#include <linux/dma-direct.h>
#include <asm/iommu.h>
#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index c4b3646bd04c..9542a746dc50 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -409,10 +409,8 @@ int __init pci_xen_init(void)
pcibios_enable_irq = xen_pcifront_enable_irq;
pcibios_disable_irq = NULL;
-#ifdef CONFIG_ACPI
/* Keep ACPI out of the picture */
- acpi_noirq = 1;
-#endif
+ acpi_noirq_set();
#ifdef CONFIG_PCI_MSI
x86_msi.setup_msi_irqs = xen_setup_msi_irqs;
diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
index a952ac199741..6f37a2137a79 100644
--- a/arch/x86/platform/intel/iosf_mbi.c
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -218,14 +218,23 @@ int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(iosf_mbi_register_pmic_bus_access_notifier);
+int iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
+ struct notifier_block *nb)
+{
+ iosf_mbi_assert_punit_acquired();
+
+ return blocking_notifier_chain_unregister(
+ &iosf_mbi_pmic_bus_access_notifier, nb);
+}
+EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier_unlocked);
+
int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb)
{
int ret;
/* Wait for the bus to go inactive before unregistering */
mutex_lock(&iosf_mbi_punit_mutex);
- ret = blocking_notifier_chain_unregister(
- &iosf_mbi_pmic_bus_access_notifier, nb);
+ ret = iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(nb);
mutex_unlock(&iosf_mbi_punit_mutex);
return ret;
@@ -239,6 +248,12 @@ int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v)
}
EXPORT_SYMBOL(iosf_mbi_call_pmic_bus_access_notifier_chain);
+void iosf_mbi_assert_punit_acquired(void)
+{
+ WARN_ON(!mutex_is_locked(&iosf_mbi_punit_mutex));
+}
+EXPORT_SYMBOL(iosf_mbi_assert_punit_acquired);
+
#ifdef CONFIG_IOSF_MBI_DEBUG
static u32 dbg_mdr;
static u32 dbg_mcr;
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index c2e9285d1bf1..db77e087adaf 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
local_flush_tlb();
stat->d_alltlb++;
} else {
- __flush_tlb_single(msg->address);
+ __flush_tlb_one_user(msg->address);
stat->d_onetlb++;
}
stat->d_requestee++;
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index c35fdb585c68..afc4ed7b1578 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -145,7 +145,7 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
#endif
}
-int swsusp_arch_resume(void)
+asmlinkage int swsusp_arch_resume(void)
{
int error;
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index f910c514438f..0ef5e5204968 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -174,7 +174,7 @@ out:
return 0;
}
-int swsusp_arch_resume(void)
+asmlinkage int swsusp_arch_resume(void)
{
int error;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index d85076223a69..aae88fec9941 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1300,12 +1300,12 @@ static void xen_flush_tlb(void)
preempt_enable();
}
-static void xen_flush_tlb_single(unsigned long addr)
+static void xen_flush_tlb_one_user(unsigned long addr)
{
struct mmuext_op *op;
struct multicall_space mcs;
- trace_xen_mmu_flush_tlb_single(addr);
+ trace_xen_mmu_flush_tlb_one_user(addr);
preempt_disable();
@@ -2370,7 +2370,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
- .flush_tlb_single = xen_flush_tlb_single,
+ .flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_others = xen_flush_tlb_others,
.pgd_alloc = xen_pgd_alloc,
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 13b4f19b9131..159a897151d6 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -694,6 +694,9 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
int i, ret = 0;
pte_t *pte;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
if (kmap_ops) {
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
kmap_ops, count);
@@ -736,6 +739,9 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
{
int i, ret = 0;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
for (i = 0; i < count; i++) {
unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
unsigned long pfn = page_to_pfn(pages[i]);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 497cc55a0c16..96f26e026783 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -9,7 +9,9 @@
#include <asm/boot.h>
#include <asm/asm.h>
+#include <asm/msr.h>
#include <asm/page_types.h>
+#include <asm/percpu.h>
#include <asm/unwind_hints.h>
#include <xen/interface/elfnote.h>
@@ -35,6 +37,20 @@ ENTRY(startup_xen)
mov %_ASM_SI, xen_start_info
mov $init_thread_union+THREAD_SIZE, %_ASM_SP
+#ifdef CONFIG_X86_64
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+ * init data section till per cpu areas are set up.
+ */
+ movl $MSR_GS_BASE,%ecx
+ movq $INIT_PER_CPU_VAR(irq_stack_union),%rax
+ cdq
+ wrmsr
+#endif
+
jmp xen_start_kernel
END(startup_xen)
__FINIT