diff options
Diffstat (limited to 'arch/x86')
157 files changed, 4234 insertions, 2612 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0045e1b44190..867e7936dbc5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,8 +103,8 @@ config X86 select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_LTO_CLANG if X86_64 - select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @@ -113,6 +113,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_NO_INSTR select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 @@ -1597,7 +1598,7 @@ config NODES_SHIFT default "10" if MAXSMP default "6" if X86_64 default "3" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. @@ -1693,35 +1694,6 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. -config X86_RESERVE_LOW - int "Amount of low memory, in kilobytes, to reserve for the BIOS" - default 64 - range 4 640 - help - Specify the amount of low memory to reserve for the BIOS. - - The first page contains BIOS data structures that the kernel - must not use, so that page must always be reserved. - - By default we reserve the first 64K of physical RAM, as a - number of BIOSes are known to corrupt that memory range - during events such as suspend/resume or monitor cable - insertion, so it must not be used by the kernel. - - You can set this to 4 if you are absolutely sure that you - trust the BIOS to get all its memory reservations and usages - right. If you know your BIOS have problems beyond the - default 64K area, you can set this to 640 to avoid using the - entire low memory range. - - If you have doubts about the BIOS (e.g. suspend/resume does - not work or there's kernel crashes after certain hardware - hotplug events) then you might want to enable - X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check - typical corruption patterns. - - Leave this to the default value of 64 if you are unsure. - config MATH_EMULATION bool depends on MODIFY_LDT_SYSCALL diff --git a/arch/x86/Makefile b/arch/x86/Makefile index cb5e8d39cac1..53eceaf71ab7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -257,7 +257,7 @@ drivers-$(CONFIG_FB) += arch/x86/video/ boot := arch/x86/boot -BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 isoimage +BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 hdimage isoimage PHONY += bzImage $(BOOT_TARGETS) @@ -315,8 +315,9 @@ define archhelp echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' + echo ' hdimage - Create a BIOS/EFI hard disk image (arch/x86/boot/hdimage)' echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' - echo ' bzdisk/fdimage*/isoimage also accept:' + echo ' bzdisk/fdimage*/hdimage/isoimage also accept:' echo ' FDARGS="..." arguments for the booted kernel' echo ' FDINITRD=file initrd for the booted kernel' echo '' diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 9cc7f1357b9b..1189be057ebd 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -11,3 +11,4 @@ setup.elf fdimage mtools.conf image.iso +hdimage diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index fe605205b4ce..dfbc26a8e924 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -29,7 +29,7 @@ KCOV_INSTRUMENT := n SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage -targets += fdimage fdimage144 fdimage288 image.iso mtools.conf +targets += fdimage fdimage144 fdimage288 image.iso hdimage subdir- := compressed setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o @@ -115,47 +115,49 @@ $(obj)/compressed/vmlinux: FORCE $(Q)$(MAKE) $(build)=$(obj)/compressed $@ # Set this if you want to pass append arguments to the -# bzdisk/fdimage/isoimage kernel +# bzdisk/fdimage/hdimage/isoimage kernel FDARGS = -# Set this if you want an initrd included with the -# bzdisk/fdimage/isoimage kernel +# Set this if you want one or more initrds included in the image FDINITRD = -image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) +imgdeps = $(obj)/bzImage $(obj)/mtools.conf $(src)/genimage.sh $(obj)/mtools.conf: $(src)/mtools.conf.in sed -e 's|@OBJ@|$(obj)|g' < $< > $@ +targets += mtools.conf + +# genimage.sh requires bash, but it also has a bunch of other +# external dependencies. quiet_cmd_genimage = GENIMAGE $3 -cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ - $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD) +cmd_genimage = $(BASH) $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ + $(obj)/mtools.conf '$(FDARGS)' $(FDINITRD) -PHONY += bzdisk fdimage fdimage144 fdimage288 isoimage bzlilo install +PHONY += bzdisk fdimage fdimage144 fdimage288 hdimage isoimage install # This requires write access to /dev/fd0 -bzdisk: $(obj)/bzImage $(obj)/mtools.conf +# All images require syslinux to be installed; hdimage also requires +# EDK2/OVMF if the kernel is compiled with the EFI stub. +bzdisk: $(imgdeps) $(call cmd,genimage,bzdisk,/dev/fd0) -# These require being root or having syslinux 2.02 or higher installed -fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf +fdimage fdimage144: $(imgdeps) $(call cmd,genimage,fdimage144,$(obj)/fdimage) @$(kecho) 'Kernel: $(obj)/fdimage is ready' -fdimage288: $(obj)/bzImage $(obj)/mtools.conf +fdimage288: $(imgdeps) $(call cmd,genimage,fdimage288,$(obj)/fdimage) @$(kecho) 'Kernel: $(obj)/fdimage is ready' -isoimage: $(obj)/bzImage +hdimage: $(imgdeps) + $(call cmd,genimage,hdimage,$(obj)/hdimage) + @$(kecho) 'Kernel: $(obj)/hdimage is ready' + +isoimage: $(imgdeps) $(call cmd,genimage,isoimage,$(obj)/image.iso) @$(kecho) 'Kernel: $(obj)/image.iso is ready' -bzlilo: - if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi - if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi - cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz - cp System.map $(INSTALL_PATH)/ - if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi - install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + $(CONFIG_SHELL) $(srctree)/$(src)/install.sh \ + $(KERNELRELEASE) $(obj)/bzImage \ System.map "$(INSTALL_PATH)" diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 6a10d52a4145..0673fdfc1a11 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive @@ -8,15 +8,24 @@ # # Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others # -# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture +# "make fdimage/fdimage144/fdimage288/hdimage/isoimage" +# script for x86 architecture # # Arguments: -# $1 - fdimage format -# $2 - target image file -# $3 - kernel bzImage file -# $4 - mtool configuration file -# $5 - kernel cmdline -# $6 - inird image file +# $1 - fdimage format +# $2 - target image file +# $3 - kernel bzImage file +# $4 - mtools configuration file +# $5 - kernel cmdline +# $6+ - initrd image file(s) +# +# This script requires: +# bash +# syslinux +# mtools (for fdimage* and hdimage) +# edk2/OVMF (for hdimage) +# +# Otherwise try to stick to POSIX shell commands... # # Use "make V=1" to debug this script @@ -26,105 +35,237 @@ case "${KBUILD_VERBOSE}" in ;; esac -verify () { - if [ ! -f "$1" ]; then - echo "" 1>&2 - echo " *** Missing file: $1" 1>&2 - echo "" 1>&2 - exit 1 +# Exit the top-level shell with an error +topshell=$$ +trap 'exit 1' USR1 +die() { + echo "" 1>&2 + echo " *** $*" 1>&2 + echo "" 1>&2 + kill -USR1 $topshell +} + +# Verify the existence and readability of a file +verify() { + if [ ! -f "$1" -o ! -r "$1" ]; then + die "Missing file: $1" fi } +diskfmt="$1" +FIMAGE="$2" +FBZIMAGE="$3" +MTOOLSRC="$4" +KCMDLINE="$5" +shift 5 # Remaining arguments = initrd files + +export MTOOLSRC -export MTOOLSRC=$4 -FIMAGE=$2 -FBZIMAGE=$3 -KCMDLINE=$5 -FDINITRD=$6 +# common options for dd +dd='dd iflag=fullblock' # Make sure the files actually exist verify "$FBZIMAGE" -genbzdisk() { - verify "$MTOOLSRC" - mformat a: - syslinux $FIMAGE - echo "$KCMDLINE" | mcopy - a:syslinux.cfg - if [ -f "$FDINITRD" ] ; then - mcopy "$FDINITRD" a:initrd.img +declare -a FDINITRDS +irdpfx=' initrd=' +initrdopts_syslinux='' +initrdopts_efi='' +for f in "$@"; do + if [ -f "$f" -a -r "$f" ]; then + FDINITRDS=("${FDINITRDS[@]}" "$f") + fname="$(basename "$f")" + initrdopts_syslinux="${initrdopts_syslinux}${irdpfx}${fname}" + irdpfx=, + initrdopts_efi="${initrdopts_efi} initrd=${fname}" fi - mcopy $FBZIMAGE a:linux +done + +# Read a $3-byte littleendian unsigned value at offset $2 from file $1 +le() { + local n=0 + local m=1 + for b in $(od -A n -v -j $2 -N $3 -t u1 "$1"); do + n=$((n + b*m)) + m=$((m * 256)) + done + echo $n } -genfdimage144() { - verify "$MTOOLSRC" - dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null - mformat v: - syslinux $FIMAGE - echo "$KCMDLINE" | mcopy - v:syslinux.cfg - if [ -f "$FDINITRD" ] ; then - mcopy "$FDINITRD" v:initrd.img - fi - mcopy $FBZIMAGE v:linux +# Get the EFI architecture name such that boot{name}.efi is the default +# boot file name. Returns false with no output if the file is not an +# EFI image or otherwise unknown. +efiarch() { + [ -f "$1" ] || return + [ $(le "$1" 0 2) -eq 23117 ] || return # MZ magic + peoffs=$(le "$1" 60 4) # PE header offset + [ $peoffs -ge 64 ] || return + [ $(le "$1" $peoffs 4) -eq 17744 ] || return # PE magic + case $(le "$1" $((peoffs+4+20)) 2) in # PE type + 267) ;; # PE32 + 523) ;; # PE32+ + *) return 1 ;; # Invalid + esac + [ $(le "$1" $((peoffs+4+20+68)) 2) -eq 10 ] || return # EFI app + case $(le "$1" $((peoffs+4)) 2) in # Machine type + 332) echo i386 ;; + 450) echo arm ;; + 512) echo ia64 ;; + 20530) echo riscv32 ;; + 20580) echo riscv64 ;; + 20776) echo riscv128 ;; + 34404) echo x64 ;; + 43620) echo aa64 ;; + esac } -genfdimage288() { - verify "$MTOOLSRC" - dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null - mformat w: - syslinux $FIMAGE - echo "$KCMDLINE" | mcopy - W:syslinux.cfg - if [ -f "$FDINITRD" ] ; then - mcopy "$FDINITRD" w:initrd.img - fi - mcopy $FBZIMAGE w:linux +# Get the combined sizes in bytes of the files given, counting sparse +# files as full length, and padding each file to a 4K block size +filesizes() { + local t=0 + local s + for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do + t=$((t + ((s+4095)/4096)*4096)) + done + echo $t } -geniso() { - tmp_dir=`dirname $FIMAGE`/isoimage - rm -rf $tmp_dir - mkdir $tmp_dir - for i in lib lib64 share ; do - for j in syslinux ISOLINUX ; do - if [ -f /usr/$i/$j/isolinux.bin ] ; then - isolinux=/usr/$i/$j/isolinux.bin - fi +# Expand directory names which should be in /usr/share into a list +# of possible alternatives +sharedirs() { + local dir file + for dir in /usr/share /usr/lib64 /usr/lib; do + for file; do + echo "$dir/$file" + echo "$dir/${file^^}" done - for j in syslinux syslinux/modules/bios ; do - if [ -f /usr/$i/$j/ldlinux.c32 ]; then - ldlinux=/usr/$i/$j/ldlinux.c32 - fi + done +} +efidirs() { + local dir file + for dir in /usr/share /boot /usr/lib64 /usr/lib; do + for file; do + echo "$dir/$file" + echo "$dir/${file^^}" done - if [ -n "$isolinux" -a -n "$ldlinux" ] ; then - break + done +} + +findsyslinux() { + local f="$(find -L $(sharedirs syslinux isolinux) \ + -name "$1" -readable -type f -print -quit 2>/dev/null)" + if [ ! -f "$f" ]; then + die "Need a $1 file, please install syslinux/isolinux." + fi + echo "$f" + return 0 +} + +findovmf() { + local arch="$1" + shift + local -a names=(-false) + local name f + for name; do + names=("${names[@]}" -or -iname "$name") + done + for f in $(find -L $(efidirs edk2 ovmf) \ + \( "${names[@]}" \) -readable -type f \ + -print 2>/dev/null); do + if [ "$(efiarch "$f")" = "$arch" ]; then + echo "$f" + return 0 fi done - if [ -z "$isolinux" ] ; then - echo 'Need an isolinux.bin file, please install syslinux/isolinux.' - exit 1 + die "Need a $1 file for $arch, please install EDK2/OVMF." +} + +do_mcopy() { + if [ ${#FDINITRDS[@]} -gt 0 ]; then + mcopy "${FDINITRDS[@]}" "$1" + fi + if [ -n "$efishell" ]; then + mmd "$1"EFI "$1"EFI/Boot + mcopy "$efishell" "$1"EFI/Boot/boot${kefiarch}.efi fi - if [ -z "$ldlinux" ] ; then - echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.' - exit 1 + if [ -n "$kefiarch" ]; then + echo linux "$KCMDLINE$initrdopts_efi" | \ + mcopy - "$1"startup.nsh fi - cp $isolinux $tmp_dir - cp $ldlinux $tmp_dir - cp $FBZIMAGE $tmp_dir/linux - echo "$KCMDLINE" > $tmp_dir/isolinux.cfg - if [ -f "$FDINITRD" ] ; then - cp "$FDINITRD" $tmp_dir/initrd.img + echo default linux "$KCMDLINE$initrdopts_syslinux" | \ + mcopy - "$1"syslinux.cfg + mcopy "$FBZIMAGE" "$1"linux +} + +genbzdisk() { + verify "$MTOOLSRC" + mformat -v 'LINUX_BOOT' a: + syslinux "$FIMAGE" + do_mcopy a: +} + +genfdimage144() { + verify "$MTOOLSRC" + $dd if=/dev/zero of="$FIMAGE" bs=1024 count=1440 2>/dev/null + mformat -v 'LINUX_BOOT' v: + syslinux "$FIMAGE" + do_mcopy v: +} + +genfdimage288() { + verify "$MTOOLSRC" + $dd if=/dev/zero of="$FIMAGE" bs=1024 count=2880 2>/dev/null + mformat -v 'LINUX_BOOT' w: + syslinux "$FIMAGE" + do_mcopy w: +} + +genhdimage() { + verify "$MTOOLSRC" + mbr="$(findsyslinux mbr.bin)" + kefiarch="$(efiarch "$FBZIMAGE")" + if [ -n "$kefiarch" ]; then + # The efishell provides command line handling + efishell="$(findovmf $kefiarch shell.efi shell${kefiarch}.efi)" + ptype='-T 0xef' # EFI system partition, no GPT fi - genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \ - -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \ - -boot-info-table $tmp_dir - isohybrid $FIMAGE 2>/dev/null || true - rm -rf $tmp_dir + sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell") + # Allow 1% + 1 MiB for filesystem and partition table overhead, + # syslinux, and config files + megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024))) + $dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null + mpartition -I -c -s 32 -h 64 -t $megs $ptype -b 512 -a h: + $dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null + mformat -v 'LINUX_BOOT' -s 32 -h 64 -t $megs h: + syslinux --offset $((512*512)) "$FIMAGE" + do_mcopy h: +} + +geniso() { + tmp_dir="$(dirname "$FIMAGE")/isoimage" + rm -rf "$tmp_dir" + mkdir "$tmp_dir" + isolinux=$(findsyslinux isolinux.bin) + ldlinux=$(findsyslinux ldlinux.c32) + cp "$isolinux" "$ldlinux" "$tmp_dir" + cp "$FBZIMAGE" "$tmp_dir"/linux + echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg + cp "${FDINITRDS[@]}" "$tmp_dir"/ + genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \ + -quiet -o "$FIMAGE" -b isolinux.bin \ + -c boot.cat -no-emul-boot -boot-load-size 4 \ + -boot-info-table "$tmp_dir" + isohybrid "$FIMAGE" 2>/dev/null || true + rm -rf "$tmp_dir" } -case $1 in +rm -f "$FIMAGE" + +case "$diskfmt" in bzdisk) genbzdisk;; fdimage144) genfdimage144;; fdimage288) genfdimage288;; + hdimage) genhdimage;; isoimage) geniso;; - *) echo 'Unknown image format'; exit 1; + *) die "Unknown image format: $diskfmt";; esac diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in index efd6d2490c1d..9e2662d01364 100644 --- a/arch/x86/boot/mtools.conf.in +++ b/arch/x86/boot/mtools.conf.in @@ -14,4 +14,7 @@ drive v: drive w: file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter +# Hard disk +drive h: + file="@OBJ@/hdimage" partition=1 mformat_only diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 6706b6cb1d0f..38caf61cd5b7 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -1500,7 +1500,7 @@ static int __init curve25519_mod_init(void) static void __exit curve25519_mod_exit(void) { if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && - (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX))) + static_branch_likely(&curve25519_use_bmi2_adx)) crypto_unregister_kpp(&curve25519_alg); } diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 08bf95dbc911..7fec5dcf6438 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -8,18 +8,8 @@ UBSAN_SANITIZE := n KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE) CFLAGS_common.o += -fno-stack-protector -CFLAGS_syscall_64.o += -fno-stack-protector -CFLAGS_syscall_32.o += -fno-stack-protector -CFLAGS_syscall_x32.o += -fno-stack-protector - -CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,) obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 07a9331d55e7..a4c061fb7c6e 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include <asm/percpu.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/ptrace-abi.h> /* @@ -62,42 +63,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -/* The layout forms the "struct pt_regs" on the stack: */ -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ -#define R15 0*8 -#define R14 1*8 -#define R13 2*8 -#define R12 3*8 -#define RBP 4*8 -#define RBX 5*8 -/* These regs are callee-clobbered. Always saved on kernel entry. */ -#define R11 6*8 -#define R10 7*8 -#define R9 8*8 -#define R8 9*8 -#define RAX 10*8 -#define RCX 11*8 -#define RDX 12*8 -#define RSI 13*8 -#define RDI 14*8 -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ -#define ORIG_RAX 15*8 -/* Return frame for iretq */ -#define RIP 16*8 -#define CS 17*8 -#define EFLAGS 18*8 -#define RSP 19*8 -#define SS 20*8 - -#define SIZEOF_PTREGS 21*8 - -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -124,7 +90,9 @@ For 32-bit we have the following conventions - kernel is built with .if \save_ret pushq %rsi /* return address on top of stack */ .endif +.endm +.macro CLEAR_REGS /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -146,6 +114,11 @@ For 32-bit we have the following conventions - kernel is built with .endm +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 + PUSH_REGS rdx=\rdx, rax=\rax, save_ret=\save_ret + CLEAR_REGS +.endm + .macro POP_REGS pop_rdi=1 skip_r11rcx=0 popq %r15 popq %r14 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7b2542b13ebd..6c2826417b33 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -36,61 +36,97 @@ #include <asm/irq_stack.h> #ifdef CONFIG_X86_64 -__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) + +static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < NR_syscalls)) { + unr = array_index_nospec(unr, NR_syscalls); + regs->ax = sys_call_table[unr](regs); + return true; + } + return false; +} + +static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) +{ + /* + * Adjust the starting offset of the table, and convert numbers + * < __X32_SYSCALL_BIT to very high and thus out of range + * numbers for comparisons. + */ + unsigned int xnr = nr - __X32_SYSCALL_BIT; + + if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { + xnr = array_index_nospec(xnr, X32_NR_syscalls); + regs->ax = x32_sys_call_table[xnr](regs); + return true; + } + return false; +} + +__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); - if (likely(nr < NR_syscalls)) { - nr = array_index_nospec(nr, NR_syscalls); - regs->ax = sys_call_table[nr](regs); -#ifdef CONFIG_X86_X32_ABI - } else if (likely((nr & __X32_SYSCALL_BIT) && - (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { - nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, - X32_NR_syscalls); - regs->ax = x32_sys_call_table[nr](regs); -#endif + + if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { + /* Invalid system call, but still a system call. */ + regs->ax = __x64_sys_ni_syscall(regs); } + instrumentation_end(); syscall_exit_to_user_mode(regs); } #endif #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) +static __always_inline int syscall_32_enter(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; - return (unsigned int)regs->orig_ax; + return (int)regs->orig_ax; } /* * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, - unsigned int nr) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) { - if (likely(nr < IA32_NR_syscalls)) { - nr = array_index_nospec(nr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[nr](regs); + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < IA32_NR_syscalls)) { + unr = array_index_nospec(unr, IA32_NR_syscalls); + regs->ax = ia32_sys_call_table[unr](regs); + } else if (nr != -1) { + regs->ax = __ia32_sys_ni_syscall(regs); } } /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + int nr = syscall_32_enter(regs); add_random_kstack_offset(); /* - * Subtlety here: if ptrace pokes something larger than 2^32-1 into - * orig_ax, the unsigned int return value truncates it. This may - * or may not be necessary, but it matches the old asm behavior. + * Subtlety here: if ptrace pokes something larger than 2^31-1 into + * orig_ax, the int return value truncates it. This matches + * the semantics of syscall_get_nr(). */ - nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); do_syscall_32_irqs_on(regs, nr); @@ -101,7 +137,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + int nr = syscall_32_enter(regs); int res; add_random_kstack_offset(); @@ -130,14 +166,13 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* User code screwed up. */ regs->ax = -EFAULT; - instrumentation_end(); local_irq_disable(); + instrumentation_end(); irqentry_exit_to_user_mode(regs); return false; } - /* The case truncates any ptrace induced syscall nr > 2^32 -1 */ - nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); + nr = syscall_enter_from_user_mode_work(regs, nr); /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); @@ -269,15 +304,16 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) irqentry_state_t state = irqentry_enter(regs); bool inhcall; + instrumentation_begin(); run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); inhcall = get_and_clear_inhcall(); if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { - instrumentation_begin(); irqentry_exit_cond_resched(); instrumentation_end(); restore_inhcall(inhcall); } else { + instrumentation_end(); irqentry_exit(regs, state); } } diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a16a5294d55f..e38a4cf795d9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -107,8 +107,9 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) PUSH_AND_CLEAR_REGS rax=$-ENOSYS /* IRQs are off. */ - movq %rax, %rdi - movq %rsp, %rsi + movq %rsp, %rdi + /* Sign extend the lower 32bit as syscall numbers are treated as int */ + movslq %eax, %rsi call do_syscall_64 /* returns with IRQs disabled */ /* @@ -506,7 +507,7 @@ SYM_CODE_START(\asmsym) movq %rsp, %rdi /* pt_regs pointer */ - call \cfunc + call kernel_\cfunc /* * No need to switch back to the IST stack. The current stack is either @@ -517,7 +518,7 @@ SYM_CODE_START(\asmsym) /* Switch to the regular task stack */ .Lfrom_usermode_switch_stack_\@: - idtentry_body safe_stack_\cfunc, has_error_code=1 + idtentry_body user_\cfunc, has_error_code=1 _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 86eb0d89d46f..8cfc9bc73e7f 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -5,21 +5,21 @@ #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> -#include <asm/unistd.h> #include <asm/syscall.h> -#define __SYSCALL_I386(nr, sym) extern long __ia32_##sym(const struct pt_regs *); +#ifdef CONFIG_IA32_EMULATION +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) +#else +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#endif + +#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *); #include <asm/syscalls_32.h> -#undef __SYSCALL_I386 +#undef __SYSCALL -#define __SYSCALL_I386(nr, sym) [nr] = __ia32_##sym, +#define __SYSCALL(nr, sym) __ia32_##sym, -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &__ia32_sys_ni_syscall, +__visible const sys_call_ptr_t ia32_sys_call_table[] = { #include <asm/syscalls_32.h> }; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 1594ec72bcbb..be120eec1fc9 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -5,23 +5,14 @@ #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> -#include <asm/unistd.h> #include <asm/syscall.h> -#define __SYSCALL_X32(nr, sym) -#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) - -#define __SYSCALL_64(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include <asm/syscalls_64.h> -#undef __SYSCALL_64 +#undef __SYSCALL -#define __SYSCALL_64(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) __x64_##sym, -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall, +asmlinkage const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_64.h> }; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index f2fe0a33bcfd..bdd0e03a1265 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -5,37 +5,14 @@ #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> -#include <asm/unistd.h> #include <asm/syscall.h> -/* - * Reuse the 64-bit entry points for the x32 versions that occupy different - * slots in the syscall table. - */ -#define __x32_sys_readv __x64_sys_readv -#define __x32_sys_writev __x64_sys_writev -#define __x32_sys_getsockopt __x64_sys_getsockopt -#define __x32_sys_setsockopt __x64_sys_setsockopt -#define __x32_sys_vmsplice __x64_sys_vmsplice -#define __x32_sys_process_vm_readv __x64_sys_process_vm_readv -#define __x32_sys_process_vm_writev __x64_sys_process_vm_writev +#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#include <asm/syscalls_x32.h> +#undef __SYSCALL -#define __SYSCALL_64(nr, sym) +#define __SYSCALL(nr, sym) __x64_##sym, -#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); -#define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *); -#include <asm/syscalls_64.h> -#undef __SYSCALL_X32 -#undef __SYSCALL_COMMON - -#define __SYSCALL_X32(nr, sym) [nr] = __x32_##sym, -#define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym, - -asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall, -#include <asm/syscalls_64.h> +asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { +#include <asm/syscalls_x32.h> }; diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index d8c4f6c9eadc..5b3efed0e4e8 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -9,47 +9,54 @@ _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ syscall32 := $(src)/syscall_32.tbl syscall64 := $(src)/syscall_64.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh +offset := +prefix := quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abi_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \ + $(if $(offset),--offset $(offset)) \ + $(if $(prefix),--prefix $(prefix)) \ + $< $@ quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ quiet_cmd_hypercalls = HYPERCALLS $@ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs)) -syshdr_abi_unistd_32 := i386 +$(uapi)/unistd_32.h: abis := i386 $(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_32_ia32 := i386 -syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: abis := i386 +$(out)/unistd_32_ia32.h: prefix := ia32_ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_x32 := common,x32 -syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT +$(uapi)/unistd_x32.h: abis := common,x32 +$(uapi)/unistd_x32.h: offset := __X32_SYSCALL_BIT $(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: abis := common,64 $(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_64_x32 := x32 -syshdr_pfx_unistd_64_x32 := x32_ +$(out)/unistd_64_x32.h: abis := x32 +$(out)/unistd_64_x32.h: prefix := x32_ $(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) +$(out)/syscalls_32.h: abis := i386 $(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE $(call if_changed,systbl) +$(out)/syscalls_64.h: abis := common,64 $(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE $(call if_changed,systbl) +$(out)/syscalls_x32.h: abis := common,x32 +$(out)/syscalls_x32.h: $(syscall64) $(systbl) FORCE + $(call if_changed,systbl) $(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE $(call if_changed,hypercalls) @@ -60,6 +67,7 @@ uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h +syshdr-$(CONFIG_X86_X32) += syscalls_x32.h syshdr-$(CONFIG_XEN) += xen-hypercalls.h uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4bbc267fb36b..fba2f615119a 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,7 +447,7 @@ 440 i386 process_madvise sys_process_madvise 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 i386 mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 i386 quotactl_fd sys_quotactl_fd 444 i386 landlock_create_ruleset sys_landlock_create_ruleset 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ce18119ea0d0..af973e400053 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,7 +364,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh deleted file mode 100644 index cc1e63857427..000000000000 --- a/arch/x86/entry/syscalls/syscallhdr.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_ASM_X86_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - echo "#ifndef ${fileguard}" - echo "#define ${fileguard} 1" - echo "" - - max=0 - while read nr abi name entry ; do - if [ -z "$offset" ]; then - echo "#define __NR_${prefix}${name} $nr" - else - echo "#define __NR_${prefix}${name} ($offset + $nr)" - fi - - max=$nr - done - - echo "" - echo "#ifdef __KERNEL__" - echo "#define __NR_${prefix}syscall_max $max" - echo "#endif" - echo "" - echo "#endif /* ${fileguard} */" -) > "$out" diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh deleted file mode 100644 index 929bde120d6b..000000000000 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" - -syscall_macro() { - local abi="$1" - local nr="$2" - local entry="$3" - - echo "__SYSCALL_${abi}($nr, $entry)" -} - -emit() { - local abi="$1" - local nr="$2" - local entry="$3" - local compat="$4" - - if [ "$abi" != "I386" -a -n "$compat" ]; then - echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 - exit 1 - fi - - if [ -z "$compat" ]; then - if [ -n "$entry" ]; then - syscall_macro "$abi" "$nr" "$entry" - fi - else - echo "#ifdef CONFIG_X86_32" - if [ -n "$entry" ]; then - syscall_macro "$abi" "$nr" "$entry" - fi - echo "#else" - syscall_macro "$abi" "$nr" "$compat" - echo "#endif" - fi -} - -grep '^[0-9]' "$in" | sort -n | ( - while read nr abi name entry compat; do - abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - emit "$abi" "$nr" "$entry" "$compat" - done -) > "$out" diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8f71dd72ef95..1eb45139fcc6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1626,6 +1626,8 @@ static void x86_pmu_del(struct perf_event *event, int flags) if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; + __set_bit(event->hw.idx, cpuc->dirty); + /* * Not a TXN, therefore cleanup properly. */ @@ -2474,6 +2476,31 @@ static int x86_pmu_event_init(struct perf_event *event) return err; } +void perf_clear_dirty_counters(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + /* Don't need to clear the assigned counter. */ + for (i = 0; i < cpuc->n_events; i++) + __clear_bit(cpuc->assign[i], cpuc->dirty); + + if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) + return; + + for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { + /* Metrics and fake events don't have corresponding HW counters. */ + if (is_metric_idx(i) || (i == INTEL_PMC_IDX_FIXED_VLBR)) + continue; + else if (i >= INTEL_PMC_IDX_FIXED) + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); + else + wrmsrl(x86_pmu_event_addr(i), 0); + } + + bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); +} + static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) @@ -2497,7 +2524,6 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e28892270c58..fca7a6e2242f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -280,6 +280,8 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = { INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), EVENT_EXTRA_END }; @@ -4030,8 +4032,10 @@ spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx, * The :ppp indicates the Precise Distribution (PDist) facility, which * is only supported on the GP counter 0. If a :ppp event which is not * available on the GP counter 0, error out. + * Exception: Instruction PDIR is only available on the fixed counter 0. */ - if (event->attr.precise_ip == 3) { + if ((event->attr.precise_ip == 3) && + !constraint_match(&fixed0_constraint, event->hw.config)) { if (c->idxmsk64 & BIT_ULL(0)) return &counter0_constraint; @@ -6015,7 +6019,13 @@ __init int intel_pmu_init(void) tsx_attr = hsw_tsx_events_attrs; intel_pmu_pebs_data_source_skl(pmem); - if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + /* + * Processors with CPUID.RTM_ALWAYS_ABORT have TSX deprecated by default. + * TSX force abort hooks are not required on these systems. Only deploy + * workaround when microcode has not enabled X86_FEATURE_RTM_ALWAYS_ABORT. + */ + if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) && + !boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) { x86_pmu.flags |= PMU_FL_TFA; x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; @@ -6157,8 +6167,13 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; pmu->name = "cpu_core"; pmu->cpu_type = hybrid_big; - pmu->num_counters = x86_pmu.num_counters + 2; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { + pmu->num_counters = x86_pmu.num_counters + 2; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; + } else { + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + } pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 1ec8fd311f38..8647713276a7 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1187,6 +1187,9 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + u64 value = ds->pebs_event_reset[hwc->idx]; + u32 base = MSR_RELOAD_PMC0; + unsigned int idx = hwc->idx; if (!is_pebs_pt(event)) return; @@ -1196,7 +1199,12 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) cpuc->pebs_enabled |= PEBS_OUTPUT_PT; - wrmsrl(MSR_RELOAD_PMC0 + hwc->idx, ds->pebs_event_reset[hwc->idx]); + if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + base = MSR_RELOAD_FIXED_CTR0; + idx = hwc->idx - INTEL_PMC_IDX_FIXED; + value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx]; + } + wrmsrl(base + idx, value); } void intel_pmu_pebs_enable(struct perf_event *event) @@ -1204,6 +1212,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + unsigned int idx = hwc->idx; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; @@ -1222,19 +1231,18 @@ void intel_pmu_pebs_enable(struct perf_event *event) } } + if (idx >= INTEL_PMC_IDX_FIXED) + idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); + /* * Use auto-reload if possible to save a MSR write in the PMI. * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. */ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - unsigned int idx = hwc->idx; - - if (idx >= INTEL_PMC_IDX_FIXED) - idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); ds->pebs_event_reset[idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; } else { - ds->pebs_event_reset[hwc->idx] = 0; + ds->pebs_event_reset[idx] = 0; } intel_pmu_pebs_via_pt_enable(event); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4409d2cccfda..e8453de7a964 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -731,7 +731,8 @@ void reserve_lbr_buffers(void) if (!kmem_cache || cpuc->lbr_xsave) continue; - cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL, + cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, + GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); } } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index df7b07d7fdcb..9bf4dbbc26e2 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -801,8 +801,6 @@ static void uncore_pmu_enable(struct pmu *pmu) struct intel_uncore_box *box; uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); - if (!uncore_pmu) - return; box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); if (!box) @@ -818,8 +816,6 @@ static void uncore_pmu_disable(struct pmu *pmu) struct intel_uncore_box *box; uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); - if (!uncore_pmu) - return; box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); if (!box) diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 291791002997..187d7287039c 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -92,6 +92,7 @@ struct intel_uncore_type { /* * Optional callbacks for managing mapping of Uncore units to PMONs */ + int (*get_topology)(struct intel_uncore_type *type); int (*set_mapping)(struct intel_uncore_type *type); void (*cleanup_mapping)(struct intel_uncore_type *type); }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 3a75a2c601c2..bb6eb1e5569c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -348,6 +348,13 @@ #define SKX_M2M_PCI_PMON_CTR0 0x200 #define SKX_M2M_PCI_PMON_BOX_CTL 0x258 +/* Memory Map registers device ID */ +#define SNR_ICX_MESH2IIO_MMAP_DID 0x9a2 +#define SNR_ICX_SAD_CONTROL_CFG 0x3f4 + +/* Getting I/O stack id in SAD_COTROL_CFG notation */ +#define SAD_CONTROL_STACK_ID(data) (((data) >> 4) & 0x7) + /* SNR Ubox */ #define SNR_U_MSR_PMON_CTR0 0x1f98 #define SNR_U_MSR_PMON_CTL0 0x1f91 @@ -3682,12 +3689,19 @@ static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) } static umode_t -skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, + int die, int zero_bus_pmu) { struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); - /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ - return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode; + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode; +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + /* Root bus 0x00 is valid only for pmu_idx = 0. */ + return pmu_iio_mapping_visible(kobj, attr, die, 0); } static ssize_t skx_iio_mapping_show(struct device *dev, @@ -3772,7 +3786,8 @@ static const struct attribute_group *skx_iio_attr_update[] = { NULL, }; -static int skx_iio_set_mapping(struct intel_uncore_type *type) +static int +pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) { char buf[64]; int ret; @@ -3780,7 +3795,7 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) struct attribute **attrs = NULL; struct dev_ext_attribute *eas = NULL; - ret = skx_iio_get_topology(type); + ret = type->get_topology(type); if (ret < 0) goto clear_attr_update; @@ -3807,7 +3822,7 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) eas[die].var = (void *)die; attrs[die] = &eas[die].attr.attr; } - skx_iio_mapping_group.attrs = attrs; + ag->attrs = attrs; return 0; err: @@ -3821,6 +3836,11 @@ clear_attr_update: return ret; } +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + return pmu_iio_set_mapping(type, &skx_iio_mapping_group); +} + static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) { struct attribute **attr = skx_iio_mapping_group.attrs; @@ -3851,6 +3871,7 @@ static struct intel_uncore_type skx_uncore_iio = { .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, .attr_update = skx_iio_attr_update, + .get_topology = skx_iio_get_topology, .set_mapping = skx_iio_set_mapping, .cleanup_mapping = skx_iio_cleanup_mapping, }; @@ -4393,6 +4414,91 @@ static const struct attribute_group snr_uncore_iio_format_group = { .attrs = snr_uncore_iio_formats_attr, }; +static umode_t +snr_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + /* Root bus 0x00 is valid only for pmu_idx = 1. */ + return pmu_iio_mapping_visible(kobj, attr, die, 1); +} + +static struct attribute_group snr_iio_mapping_group = { + .is_visible = snr_iio_mapping_visible, +}; + +static const struct attribute_group *snr_iio_attr_update[] = { + &snr_iio_mapping_group, + NULL, +}; + +static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_mapping) +{ + u32 sad_cfg; + int die, stack_id, ret = -EPERM; + struct pci_dev *dev = NULL; + + type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), + GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) { + ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg); + if (ret) { + ret = pcibios_err_to_errno(ret); + break; + } + + die = uncore_pcibus_to_dieid(dev->bus); + stack_id = SAD_CONTROL_STACK_ID(sad_cfg); + if (die < 0 || stack_id >= type->num_boxes) { + ret = -EPERM; + break; + } + + /* Convert stack id from SAD_CONTROL to PMON notation. */ + stack_id = sad_pmon_mapping[stack_id]; + + ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number; + type->topology[die].segment = pci_domain_nr(dev->bus); + } + + if (ret) { + kfree(type->topology); + type->topology = NULL; + } + + return ret; +} + +/* + * SNR has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON + */ +enum { + SNR_QAT_PMON_ID, + SNR_CBDMA_DMI_PMON_ID, + SNR_NIS_PMON_ID, + SNR_DLB_PMON_ID, + SNR_PCIE_GEN3_PMON_ID +}; + +static u8 snr_sad_pmon_mapping[] = { + SNR_CBDMA_DMI_PMON_ID, + SNR_PCIE_GEN3_PMON_ID, + SNR_DLB_PMON_ID, + SNR_NIS_PMON_ID, + SNR_QAT_PMON_ID +}; + +static int snr_iio_get_topology(struct intel_uncore_type *type) +{ + return sad_cfg_iio_topology(type, snr_sad_pmon_mapping); +} + +static int snr_iio_set_mapping(struct intel_uncore_type *type) +{ + return pmu_iio_set_mapping(type, &snr_iio_mapping_group); +} + static struct intel_uncore_type snr_uncore_iio = { .name = "iio", .num_counters = 4, @@ -4406,6 +4512,10 @@ static struct intel_uncore_type snr_uncore_iio = { .msr_offset = SNR_IIO_MSR_OFFSET, .ops = &ivbep_uncore_msr_ops, .format_group = &snr_uncore_iio_format_group, + .attr_update = snr_iio_attr_update, + .get_topology = snr_iio_get_topology, + .set_mapping = snr_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; static struct intel_uncore_type snr_uncore_irp = { @@ -4933,6 +5043,53 @@ static struct event_constraint icx_uncore_iio_constraints[] = { EVENT_CONSTRAINT_END }; +static umode_t +icx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + /* Root bus 0x00 is valid only for pmu_idx = 5. */ + return pmu_iio_mapping_visible(kobj, attr, die, 5); +} + +static struct attribute_group icx_iio_mapping_group = { + .is_visible = icx_iio_mapping_visible, +}; + +static const struct attribute_group *icx_iio_attr_update[] = { + &icx_iio_mapping_group, + NULL, +}; + +/* + * ICX has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON + */ +enum { + ICX_PCIE1_PMON_ID, + ICX_PCIE2_PMON_ID, + ICX_PCIE3_PMON_ID, + ICX_PCIE4_PMON_ID, + ICX_PCIE5_PMON_ID, + ICX_CBDMA_DMI_PMON_ID +}; + +static u8 icx_sad_pmon_mapping[] = { + ICX_CBDMA_DMI_PMON_ID, + ICX_PCIE1_PMON_ID, + ICX_PCIE2_PMON_ID, + ICX_PCIE3_PMON_ID, + ICX_PCIE4_PMON_ID, + ICX_PCIE5_PMON_ID, +}; + +static int icx_iio_get_topology(struct intel_uncore_type *type) +{ + return sad_cfg_iio_topology(type, icx_sad_pmon_mapping); +} + +static int icx_iio_set_mapping(struct intel_uncore_type *type) +{ + return pmu_iio_set_mapping(type, &icx_iio_mapping_group); +} + static struct intel_uncore_type icx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -4947,6 +5104,10 @@ static struct intel_uncore_type icx_uncore_iio = { .constraints = icx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &snr_uncore_iio_format_group, + .attr_update = icx_iio_attr_update, + .get_topology = icx_iio_get_topology, + .set_mapping = icx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; static struct intel_uncore_type icx_uncore_irp = { diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ad87cb36f7c8..2bf1c7ea2758 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -229,6 +229,7 @@ struct cpu_hw_events { */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; int n_events; /* the # of events in the below arrays */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 84a1042c3b01..85feafacc445 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -764,13 +764,14 @@ static struct rapl_model model_spr = { .rapl_msrs = intel_rapl_spr_msrs, }; -static struct rapl_model model_amd_fam17h = { +static struct rapl_model model_amd_hygon = { .events = BIT(PERF_RAPL_PKG), .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, .rapl_msrs = amd_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { + X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon), X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb), @@ -803,9 +804,6 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), - X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), - X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), - X86_MATCH_VENDOR_FAM(AMD, 0x19, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index bb0ae4b5c00f..6952e219cba3 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -614,50 +614,3 @@ bool hv_is_isolation_supported(void) return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; } EXPORT_SYMBOL_GPL(hv_is_isolation_supported); - -/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ -bool hv_query_ext_cap(u64 cap_query) -{ - /* - * The address of the 'hv_extended_cap' variable will be used as an - * output parameter to the hypercall below and so it should be - * compatible with 'virt_to_phys'. Which means, it's address should be - * directly mapped. Use 'static' to keep it compatible; stack variables - * can be virtually mapped, making them imcompatible with - * 'virt_to_phys'. - * Hypercall input/output addresses should also be 8-byte aligned. - */ - static u64 hv_extended_cap __aligned(8); - static bool hv_extended_cap_queried; - u64 status; - - /* - * Querying extended capabilities is an extended hypercall. Check if the - * partition supports extended hypercall, first. - */ - if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) - return false; - - /* Extended capabilities do not change at runtime. */ - if (hv_extended_cap_queried) - return hv_extended_cap & cap_query; - - status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, - &hv_extended_cap); - - /* - * The query extended capabilities hypercall should not fail under - * any normal circumstances. Avoid repeatedly making the hypercall, on - * error. - */ - hv_extended_cap_queried = true; - status &= HV_HYPERCALL_RESULT_MASK; - if (status != HV_STATUS_SUCCESS) { - pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", - status); - return false; - } - - return hv_extended_cap & cap_query; -} -EXPORT_SYMBOL_GPL(hv_query_ext_cap); diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index a09fc37ead9d..5e5b9fc2747f 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -203,7 +203,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_EXECUTABLE | MAP_32BIT, + MAP_32BIT, fd_offset); if (error != N_TXTADDR(ex)) @@ -212,7 +212,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_EXECUTABLE | MAP_32BIT, + MAP_32BIT, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return error; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index b19ec8282d50..1e51650b79d7 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -3,6 +3,7 @@ generated-y += syscalls_32.h generated-y += syscalls_64.h +generated-y += syscalls_x32.h generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 0603c7423aca..3ad3da9a7d97 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,25 +3,26 @@ #define _ASM_X86_ASM_H #ifdef __ASSEMBLY__ -# define __ASM_FORM(x) x -# define __ASM_FORM_RAW(x) x -# define __ASM_FORM_COMMA(x) x, +# define __ASM_FORM(x, ...) x,## __VA_ARGS__ +# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ +# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, #else #include <linux/stringify.h> - -# define __ASM_FORM(x) " " __stringify(x) " " -# define __ASM_FORM_RAW(x) __stringify(x) -# define __ASM_FORM_COMMA(x) " " __stringify(x) "," +# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " +# define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__) +# define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) "," #endif +#define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;) + #ifndef __x86_64__ /* 32 bit */ -# define __ASM_SEL(a,b) __ASM_FORM(a) -# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) +# define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else /* 64 bit */ -# define __ASM_SEL(a,b) __ASM_FORM(b) -# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) +# define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif #define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ @@ -119,6 +120,8 @@ # define CC_OUT(c) [_cc_ ## c] "=qm" #endif +#ifdef __KERNEL__ + /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ @@ -185,4 +188,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index f732741ad7c7..5e754e895767 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -269,6 +269,4 @@ static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) # include <asm/atomic64_64.h> #endif -#define ARCH_ATOMIC - #endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 4819d5e5a335..3ba772a69cc8 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -54,11 +54,8 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define dma_rmb() barrier() #define dma_wmb() barrier() -#ifdef CONFIG_X86_32 -#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc") -#else -#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") -#endif +#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") + #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ac37830ae941..d0ce5cfd3ac1 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -108,7 +108,7 @@ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -/* free ( 3*32+29) */ +#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ @@ -378,6 +378,7 @@ #define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ #define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ +#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ #define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h index f58de66091e5..8b6bd63530dc 100644 --- a/arch/x86/include/asm/crash.h +++ b/arch/x86/include/asm/crash.h @@ -9,10 +9,4 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params); void crash_smp_send_stop(void); -#ifdef CONFIG_KEXEC_CORE -void __init crash_reserve_low_1M(void); -#else -static inline void __init crash_reserve_low_1M(void) { } -#endif - #endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 476082a83d1c..e63cf582201f 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -224,6 +224,26 @@ static inline void store_idt(struct desc_ptr *dtr) asm volatile("sidt %0":"=m" (*dtr)); } +static inline void native_gdt_invalidate(void) +{ + const struct desc_ptr invalid_gdt = { + .address = 0, + .size = 0 + }; + + native_load_gdt(&invalid_gdt); +} + +static inline void native_idt_invalidate(void) +{ + const struct desc_ptr invalid_idt = { + .address = 0, + .size = 0 + }; + + native_load_idt(&invalid_idt); +} + /* * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is * a read-only remapping. To prevent a page fault, the GDT is switched to the @@ -421,12 +441,10 @@ extern bool idt_is_f00f_address(unsigned long address); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); -extern void idt_setup_ist_traps(void); #else static inline void idt_setup_early_pf(void) { } -static inline void idt_setup_ist_traps(void) { } #endif -extern void idt_invalidate(void *addr); +extern void idt_invalidate(void); #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fdee23ea4e17..16bf4d4a8159 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -204,6 +204,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); } +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" @@ -272,28 +280,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_all; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - else - XSTATE_OP(XSAVE, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) { u64 mask = -1; diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 606f5cc579b2..f1366ce609e3 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -52,7 +52,7 @@ * Support for passing hypercall input parameter block via XMM * registers is available */ -#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4) +#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) /* Support for a virtual guest idle state is available */ #define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) /* Frequency MSRs available */ @@ -61,6 +61,11 @@ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) /* Support for debug MSRs available */ #define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +/* + * Support for returning hypercall output block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE BIT(15) /* stimer Direct Mode is available */ #define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) @@ -133,6 +138,15 @@ #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) +/* + * This is specific to AMD and specifies that enlightened TLB flush is + * supported. If guest opts in to this feature, ASID invalidations only + * flushes gva -> hpa mapping entries. To flush the TLB entries derived + * from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace + * or HvFlushGuestPhysicalAddressList). + */ +#define HV_X64_NESTED_ENLIGHTENED_TLB BIT(22) + /* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */ #define HV_PARAVISOR_PRESENT BIT(0) @@ -314,6 +328,9 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 +/* Number of XMM registers used in hypercall input/output */ +#define HV_HYPERCALL_MAX_XMM_REGISTERS 6 + struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 73d45b0dfff2..1345088e9902 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -312,8 +312,8 @@ static __always_inline void __##func(struct pt_regs *regs) */ #define DECLARE_IDTENTRY_VC(vector, func) \ DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \ - __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \ - __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code) + __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \ + __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code) /** * DEFINE_IDTENTRY_IST - Emit code for IST entry points @@ -355,33 +355,24 @@ static __always_inline void __##func(struct pt_regs *regs) DEFINE_IDTENTRY_RAW_ERRORCODE(func) /** - * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler - which runs on a safe stack. + * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler + when raised from kernel mode * @func: Function name of the entry point * * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE */ -#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \ - DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func) +#define DEFINE_IDTENTRY_VC_KERNEL(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func) /** - * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler - which runs on the VC fall-back stack + * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler + when raised from user mode * @func: Function name of the entry point * * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE */ -#define DEFINE_IDTENTRY_VC_IST(func) \ - DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func) - -/** - * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler - * @func: Function name of the entry point - * - * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE - */ -#define DEFINE_IDTENTRY_VC(func) \ - DEFINE_IDTENTRY_RAW_ERRORCODE(func) +#define DEFINE_IDTENTRY_VC_USER(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func) #else /* CONFIG_X86_64 */ @@ -504,7 +495,7 @@ __visible noinstr void func(struct pt_regs *regs, \ .align 8 SYM_CODE_START(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + .rept NR_EXTERNAL_VECTORS UNWIND_HINT_IRET_REGS 0 : .byte 0x6a, vector @@ -520,7 +511,7 @@ SYM_CODE_END(irq_entries_start) .align 8 SYM_CODE_START(spurious_entries_start) vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + .rept NR_SYSTEM_VECTORS UNWIND_HINT_IRET_REGS 0 : .byte 0x6a, vector diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 955b06d6325a..27158436f322 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -102,7 +102,8 @@ #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ #define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ -#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Willow Cove */ + +#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 889f8b1b5b7f..43dcb9284208 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -26,8 +26,8 @@ * This file enumerates the exact layout of them: */ +/* This is used as an interrupt vector when programming the APIC. */ #define NMI_VECTOR 0x02 -#define MCE_VECTOR 0x12 /* * IDT vectors usable for external interrupt sources start at 0x20. @@ -84,7 +84,7 @@ */ #define IRQ_WORK_VECTOR 0xf6 -#define UV_BAU_MESSAGE 0xf5 +/* 0xf5 - unused, was UV_BAU_MESSAGE */ #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ @@ -114,6 +114,9 @@ #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif +#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) +#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR) + /* * Size the maximum number of interrupts. * diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 610a05374c02..0449b125d27f 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -4,8 +4,6 @@ #define HAVE_JUMP_LABEL_BATCH -#define JUMP_LABEL_NOP_SIZE 5 - #include <asm/asm.h> #include <asm/nops.h> @@ -14,15 +12,35 @@ #include <linux/stringify.h> #include <linux/types.h> +#define JUMP_TABLE_ENTRY \ + ".pushsection __jump_table, \"aw\" \n\t" \ + _ASM_ALIGN "\n\t" \ + ".long 1b - . \n\t" \ + ".long %l[l_yes] - . \n\t" \ + _ASM_PTR "%c0 + %c1 - .\n\t" \ + ".popsection \n\t" + +#ifdef CONFIG_STACK_VALIDATION + +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:" + "jmp %l[l_yes] # objtool NOPs this \n\t" + JUMP_TABLE_ENTRY + : : "i" (key), "i" (2 | branch) : : l_yes); + + return false; +l_yes: + return true; +} + +#else + static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - ".long 1b - ., %l[l_yes] - . \n\t" - _ASM_PTR "%c0 + %c1 - .\n\t" - ".popsection \n\t" + JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); return false; @@ -30,16 +48,13 @@ l_yes: return true; } +#endif /* STACK_VALIDATION */ + static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" - ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" - "2:\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - ".long 1b - ., %l[l_yes] - . \n\t" - _ASM_PTR "%c0 + %c1 - .\n\t" - ".popsection \n\t" + "jmp %l[l_yes]\n\t" + JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); return false; @@ -47,41 +62,7 @@ l_yes: return true; } -#else /* __ASSEMBLY__ */ - -.macro STATIC_JUMP_IF_TRUE target, key, def -.Lstatic_jump_\@: - .if \def - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .else - .byte BYTES_NOP5 - .endif - .pushsection __jump_table, "aw" - _ASM_ALIGN - .long .Lstatic_jump_\@ - ., \target - . - _ASM_PTR \key - . - .popsection -.endm - -.macro STATIC_JUMP_IF_FALSE target, key, def -.Lstatic_jump_\@: - .if \def - .byte BYTES_NOP5 - .else - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .endif - .pushsection __jump_table, "aw" - _ASM_ALIGN - .long .Lstatic_jump_\@ - ., \target - . - _ASM_PTR \key + 1 - . - .popsection -.endm +extern int arch_jump_entry_size(struct jump_entry *entry); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e7bef91cee04..a12a4987154e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -87,7 +87,10 @@ KVM_X86_OP(set_identity_map_addr) KVM_X86_OP(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP_NULL(has_wbinvd_exit) -KVM_X86_OP(write_l1_tsc_offset) +KVM_X86_OP(get_l2_tsc_offset) +KVM_X86_OP(get_l2_tsc_multiplier) +KVM_X86_OP(write_tsc_offset) +KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) @@ -106,8 +109,8 @@ KVM_X86_OP_NULL(set_hv_timer) KVM_X86_OP_NULL(cancel_hv_timer) KVM_X86_OP(setup_mce) KVM_X86_OP(smi_allowed) -KVM_X86_OP(pre_enter_smm) -KVM_X86_OP(pre_leave_smm) +KVM_X86_OP(enter_smm) +KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) KVM_X86_OP_NULL(mem_enc_op) KVM_X86_OP_NULL(mem_enc_reg_region) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9c7ced0e3171..974cbfb1eefe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -85,7 +85,7 @@ #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) -#define KVM_REQ_HV_TLB_FLUSH \ +#define KVM_REQ_TLB_FLUSH_GUEST \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) @@ -269,12 +269,36 @@ enum x86_intercept_stage; struct kvm_kernel_irq_routing_entry; /* - * the pages used as guest page table on soft mmu are tracked by - * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used - * by indirect shadow page can not be more than 15 bits. + * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page + * also includes TDP pages) to determine whether or not a page can be used in + * the given MMU context. This is a subset of the overall kvm_mmu_role to + * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating + * 2 bytes per gfn instead of 4 bytes per gfn. * - * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access, - * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. + * Indirect upper-level shadow pages are tracked for write-protection via + * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create + * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise + * gfn_track will overflow and explosions will ensure. + * + * A unique shadow page (SP) for a gfn is created if and only if an existing SP + * cannot be reused. The ability to reuse a SP is tracked by its role, which + * incorporates various mode bits and properties of the SP. Roughly speaking, + * the number of unique SPs that can theoretically be created is 2^n, where n + * is the number of bits that are used to compute the role. + * + * But, even though there are 18 bits in the mask below, not all combinations + * of modes and flags are possible. The maximum number of possible upper-level + * shadow pages for a single gfn is in the neighborhood of 2^13. + * + * - invalid shadow pages are not accounted. + * - level is effectively limited to four combinations, not 16 as the number + * bits would imply, as 4k SPs are not tracked (allowed to go unsync). + * - level is effectively unused for non-PAE paging because there is exactly + * one upper level (see 4k SP exception above). + * - quadrant is used only for non-PAE paging and is exclusive with + * gpte_is_8_bytes. + * - execonly and ad_disabled are used only for nested EPT, which makes it + * exclusive with quadrant. */ union kvm_mmu_page_role { u32 word; @@ -285,7 +309,7 @@ union kvm_mmu_page_role { unsigned direct:1; unsigned access:3; unsigned invalid:1; - unsigned nxe:1; + unsigned efer_nx:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; @@ -303,13 +327,26 @@ union kvm_mmu_page_role { }; }; -union kvm_mmu_extended_role { /* - * This structure complements kvm_mmu_page_role caching everything needed for - * MMU configuration. If nothing in both these structures changed, MMU - * re-configuration can be skipped. @valid bit is set on first usage so we don't - * treat all-zero structure as valid data. + * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties + * relevant to the current MMU configuration. When loading CR0, CR4, or EFER, + * including on nested transitions, if nothing in the full role changes then + * MMU re-configuration can be skipped. @valid bit is set on first usage so we + * don't treat all-zero structure as valid data. + * + * The properties that are tracked in the extended role but not the page role + * are for things that either (a) do not affect the validity of the shadow page + * or (b) are indirectly reflected in the shadow page's role. For example, + * CR4.PKE only affects permission checks for software walks of the guest page + * tables (because KVM doesn't support Protection Keys with shadow paging), and + * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level. + * + * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role. + * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and + * SMAP, but the MMU's permission checks for software walks need to be SMEP and + * SMAP aware regardless of CR0.WP. */ +union kvm_mmu_extended_role { u32 word; struct { unsigned int valid:1; @@ -320,7 +357,7 @@ union kvm_mmu_extended_role { unsigned int cr4_pke:1; unsigned int cr4_smap:1; unsigned int cr4_smep:1; - unsigned int maxphyaddr:6; + unsigned int cr4_la57:1; }; }; @@ -420,11 +457,6 @@ struct kvm_mmu { struct rsvd_bits_validate guest_rsvd_check; - /* Can have large pages at levels 2..last_nonleaf_level-1. */ - u8 last_nonleaf_level; - - bool nx; - u64 pdptrs[4]; /* pae */ }; @@ -543,6 +575,15 @@ struct kvm_vcpu_hv { struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); cpumask_t tlb_flush; + bool enforce_cpuid; + struct { + u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */ + u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */ + u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */ + u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ + u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ + u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ + } cpuid_cache; }; /* Xen HVM per vcpu emulation context */ @@ -707,7 +748,7 @@ struct kvm_vcpu_arch { } st; u64 l1_tsc_offset; - u64 tsc_offset; + u64 tsc_offset; /* current tsc offset */ u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; @@ -721,7 +762,8 @@ struct kvm_vcpu_arch { u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; u64 msr_ia32_power_ctl; - u64 tsc_scaling_ratio; + u64 l1_tsc_scaling_ratio; + u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -829,7 +871,7 @@ struct kvm_vcpu_arch { bool l1tf_flush_l1d; /* Host CPU on which VM-entry was most recently attempted */ - unsigned int last_vmentry_cpu; + int last_vmentry_cpu; /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; @@ -851,6 +893,16 @@ struct kvm_vcpu_arch { /* Protected Guests */ bool guest_state_protected; + + /* + * Set when PDPTS were loaded directly by the userspace without + * reading the guest memory + */ + bool pdptrs_from_userspace; + +#if IS_ENABLED(CONFIG_HYPERV) + hpa_t hv_root_tdp; +#endif }; struct kvm_lpage_info { @@ -1002,7 +1054,7 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; - bool apic_access_page_done; + bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; gpa_t wall_clock; @@ -1062,11 +1114,19 @@ struct kvm_arch { bool exception_payload_enabled; bool bus_lock_detection_enabled; + /* + * If exit_on_emulation_error is set, and the in-kernel instruction + * emulator fails to emulate an instruction, allow userspace + * the opportunity to look at it. + */ + bool exit_on_emulation_error; /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; struct kvm_x86_msr_filter __rcu *msr_filter; + u32 hypercall_exit_enabled; + /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; @@ -1124,23 +1184,35 @@ struct kvm_arch { */ spinlock_t tdp_mmu_pages_lock; #endif /* CONFIG_X86_64 */ + + /* + * If set, rmaps have been allocated for all memslots and should be + * allocated for any newly created or modified memslots. + */ + bool memslots_have_rmaps; + +#if IS_ENABLED(CONFIG_HYPERV) + hpa_t hv_root_tdp; + spinlock_t hv_root_tdp_lock; +#endif }; struct kvm_vm_stat { - ulong mmu_shadow_zapped; - ulong mmu_pte_write; - ulong mmu_pde_zapped; - ulong mmu_flooded; - ulong mmu_recycled; - ulong mmu_cache_miss; - ulong mmu_unsync; - ulong remote_tlb_flush; - ulong lpages; - ulong nx_lpage_splits; - ulong max_mmu_page_hash_collisions; + struct kvm_vm_stat_generic generic; + u64 mmu_shadow_zapped; + u64 mmu_pte_write; + u64 mmu_pde_zapped; + u64 mmu_flooded; + u64 mmu_recycled; + u64 mmu_cache_miss; + u64 mmu_unsync; + u64 lpages; + u64 nx_lpage_splits; + u64 max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; u64 pf_fixed; u64 pf_guest; u64 tlb_flush; @@ -1154,10 +1226,6 @@ struct kvm_vcpu_stat { u64 nmi_window_exits; u64 l1d_flush; u64 halt_exits; - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_invalid; - u64 halt_wakeup; u64 request_irq_exits; u64 irq_exits; u64 host_state_reload; @@ -1168,11 +1236,10 @@ struct kvm_vcpu_stat { u64 irq_injections; u64 nmi_injections; u64 req_event; - u64 halt_poll_success_ns; - u64 halt_poll_fail_ns; u64 nested_run; u64 directed_yield_attempted; u64 directed_yield_successful; + u64 guest_mode; }; struct x86_instruction_info; @@ -1304,8 +1371,10 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - /* Returns actual tsc_offset set in active VMCS */ - u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); + u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); /* * Retrieve somewhat arbitrary exit information. Intended to be used @@ -1363,8 +1432,8 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); + int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); + int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); void (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); @@ -1423,6 +1492,7 @@ struct kvm_arch_async_pf { extern u32 __read_mostly kvm_nr_uret_msrs; extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; +extern bool __read_mostly enable_apicv; extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ @@ -1463,6 +1533,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -1477,7 +1548,6 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); -bool pdptrs_changed(struct kvm_vcpu *vcpu); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1650,6 +1720,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free); +void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, @@ -1662,7 +1733,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); bool kvm_apicv_activated(struct kvm *kvm); -void kvm_apicv_init(struct kvm *kvm, bool enable); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_request_apicv_update(struct kvm *kvm, bool activate, unsigned long bit); @@ -1675,8 +1745,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, - bool skip_mmu_sync); +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, int tdp_huge_page_level); @@ -1788,8 +1857,10 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr) return kvm_find_user_return_msr(msr) >= 0; } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); @@ -1863,4 +1934,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) int kvm_cpu_dirty_log_size(void); +int alloc_all_memslots_rmaps(struct kvm *kvm); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ddfb3cad8dff..0607ec4f5091 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -305,7 +305,7 @@ extern void apei_mce_report_mem_error(int corrected, /* These may be used by multiple smca_hwid_mcatypes */ enum smca_bank_types { SMCA_LS = 0, /* Load Store */ - SMCA_LS_V2, /* Load Store */ + SMCA_LS_V2, SMCA_IF, /* Instruction Fetch */ SMCA_L2_CACHE, /* L2 Cache */ SMCA_DE, /* Decoder Unit */ @@ -314,17 +314,22 @@ enum smca_bank_types { SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ SMCA_CS, /* Coherent Slave */ - SMCA_CS_V2, /* Coherent Slave */ + SMCA_CS_V2, SMCA_PIE, /* Power, Interrupts, etc. */ SMCA_UMC, /* Unified Memory Controller */ + SMCA_UMC_V2, SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ - SMCA_PSP_V2, /* Platform Security Processor */ + SMCA_PSP_V2, SMCA_SMU, /* System Management Unit */ - SMCA_SMU_V2, /* System Management Unit */ + SMCA_SMU_V2, SMCA_MP5, /* Microprocessor 5 Unit */ SMCA_NBIO, /* Northbridge IO Unit */ SMCA_PCIE, /* PCI Express Unit */ + SMCA_PCIE_V2, + SMCA_XGMI_PCS, /* xGMI PCS Unit */ + SMCA_XGMI_PHY, /* xGMI PHY Unit */ + SMCA_WAFL_PHY, /* WAFL PHY Unit */ N_SMCA_BANK_TYPES }; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 211ba3375ee9..a7c413432b33 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -772,6 +772,10 @@ #define MSR_TFA_RTM_FORCE_ABORT_BIT 0 #define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT) +#define MSR_TFA_TSX_CPUID_CLEAR_BIT 1 +#define MSR_TFA_TSX_CPUID_CLEAR BIT_ULL(MSR_TFA_TSX_CPUID_CLEAR_BIT) +#define MSR_TFA_SDV_ENABLE_RTM_BIT 2 +#define MSR_TFA_SDV_ENABLE_RTM BIT_ULL(MSR_TFA_SDV_ENABLE_RTM_BIT) /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index c1e5e818ba16..c5573eaa5bb9 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H +#include <asm/asm.h> + /* * Define nops for use with alternative() and for tracing. */ @@ -57,20 +59,14 @@ #endif /* CONFIG_64BIT */ -#ifdef __ASSEMBLY__ -#define _ASM_MK_NOP(x) .byte x -#else -#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" -#endif - -#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1) -#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2) -#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3) -#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4) -#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5) -#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6) -#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7) -#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8) +#define ASM_NOP1 _ASM_BYTES(BYTES_NOP1) +#define ASM_NOP2 _ASM_BYTES(BYTES_NOP2) +#define ASM_NOP3 _ASM_BYTES(BYTES_NOP3) +#define ASM_NOP4 _ASM_BYTES(BYTES_NOP4) +#define ASM_NOP5 _ASM_BYTES(BYTES_NOP5) +#define ASM_NOP6 _ASM_BYTES(BYTES_NOP6) +#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) +#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) #define ASM_NOP_MAX 8 diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 7555b48803a8..4d5810c8fab7 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -34,9 +34,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index ca840fec7776..4bde0dc66100 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -75,7 +75,7 @@ void copy_page(void *to, void *from); * * With page table isolation enabled, we map the LDT in ... [stay tuned] */ -static inline unsigned long task_size_max(void) +static __always_inline unsigned long task_size_max(void) { unsigned long ret; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 544f41a179fb..8fc1b5003713 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -478,6 +478,7 @@ struct x86_pmu_lbr { extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); +extern void perf_clear_dirty_counters(void); extern int x86_perf_rdpmc_index(struct perf_event *event); #else static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index f8cb8af4de5c..fe5efbcba824 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ + per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 556b2b17c3e2..364d0e42e280 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -663,6 +663,7 @@ extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); +extern void cpu_init_secondary(void); extern void cpu_init_exception_handling(void); extern void cr4_init(void); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 629c3df243f0..2cef6c5a52c2 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -9,8 +9,13 @@ #define __ASM_X86_SEV_COMMON_H #define GHCB_MSR_INFO_POS 0 -#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) +#define GHCB_DATA_LOW 12 +#define GHCB_MSR_INFO_MASK (BIT_ULL(GHCB_DATA_LOW) - 1) +#define GHCB_DATA(v) \ + (((unsigned long)(v) & ~GHCB_MSR_INFO_MASK) >> GHCB_DATA_LOW) + +/* SEV Information Request/Response */ #define GHCB_MSR_SEV_INFO_RESP 0x001 #define GHCB_MSR_SEV_INFO_REQ 0x002 #define GHCB_MSR_VER_MAX_POS 48 @@ -28,6 +33,7 @@ #define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK) #define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK) +/* CPUID Request/Response */ #define GHCB_MSR_CPUID_REQ 0x004 #define GHCB_MSR_CPUID_RESP 0x005 #define GHCB_MSR_CPUID_FUNC_POS 32 @@ -45,6 +51,14 @@ (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \ (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS)) +/* AP Reset Hold */ +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 +#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 + +/* GHCB Hypervisor Feature Request/Response */ +#define GHCB_MSR_HV_FT_REQ 0x080 +#define GHCB_MSR_HV_FT_RESP 0x081 + #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 9c31e0ebc55b..05f3e21f01a7 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -13,7 +13,7 @@ /* * This file contains both data structures defined by SGX architecture and Linux * defined software data structures and functions. The two should not be mixed - * together for better readibility. The architectural definitions come first. + * together for better readability. The architectural definitions come first. */ /* The SGX specific CPUID function. */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index b6ffe58c70fa..24a8d6c4fb18 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -11,7 +11,7 @@ * The same segment is shared by percpu area and stack canary. On * x86_64, percpu symbols are zero based and %gs (64-bit) points to the * base of percpu area. The first occupant of the percpu area is always - * fixed_percpu_data which contains stack_canary at the approproate + * fixed_percpu_data which contains stack_canary at the appropriate * offset. On x86_32, the stack canary is just a regular percpu * variable. * diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 772e60efe243..e322676039f4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -156,6 +156,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 avic_physical_id; /* Offset 0xf8 */ u8 reserved_7[8]; u64 vmsa_pa; /* Used for an SEV-ES guest */ + u8 reserved_8[720]; + /* + * Offset 0x3e0, 32 bytes reserved + * for use by hypervisor/software. + */ + u8 reserved_sw[32]; }; @@ -314,7 +320,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 -#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) @@ -326,7 +332,6 @@ static inline void __unused_size_checks(void) struct vmcb { struct vmcb_control_area control; - u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; } __packed; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 7cbf733d11af..f7e2d82d24fb 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -21,13 +21,12 @@ extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) #define ia32_sys_call_table sys_call_table -#endif - -#if defined(CONFIG_IA32_EMULATION) +#else +/* + * These may not exist, but still put the prototypes in so we + * can use IS_ENABLED(). + */ extern const sys_call_ptr_t ia32_sys_call_table[]; -#endif - -#ifdef CONFIG_X86_X32_ABI extern const sys_call_ptr_t x32_sys_call_table[]; #endif @@ -160,7 +159,7 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(unsigned long nr, struct pt_regs *regs); +void do_syscall_64(struct pt_regs *regs, int nr); void do_int80_syscall_32(struct pt_regs *regs); long do_fast_syscall_32(struct pt_regs *regs); diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index 80c08c7d5e72..6a2827d0681f 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -17,7 +17,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * __x64_sys_*() - 64-bit native syscall * __ia32_sys_*() - 32-bit native syscall or common compat syscall * __ia32_compat_sys_*() - 32-bit compat syscall - * __x32_compat_sys_*() - 64-bit X32 compat syscall + * __x64_compat_sys_*() - 64-bit X32 compat syscall * * The registers are decoded according to the ABI: * 64-bit: RDI, RSI, RDX, R10, R8, R9 @@ -166,17 +166,17 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * with x86_64 obviously do not need such care. */ #define __X32_COMPAT_SYS_STUB0(name) \ - __SYS_STUB0(x32, compat_sys_##name) + __SYS_STUB0(x64, compat_sys_##name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) \ - __SYS_STUBx(x32, compat_sys##name, \ + __SYS_STUBx(x64, compat_sys##name, \ SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__)) #define __X32_COMPAT_COND_SYSCALL(name) \ - __COND_SYSCALL(x32, compat_sys_##name) + __COND_SYSCALL(x64, compat_sys_##name) #define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x32, compat_sys_##name) + __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32 */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index c1c3d31b15c0..80e9d5206a71 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -13,7 +13,7 @@ # define __ARCH_WANT_SYS_OLD_MMAP # define __ARCH_WANT_SYS_OLD_SELECT -# define __NR_ia32_syscall_max __NR_syscall_max +# define IA32_NR_syscalls (__NR_syscalls) # else @@ -26,12 +26,12 @@ # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 +# define X32_NR_syscalls (__NR_x32_syscalls) +# define IA32_NR_syscalls (__NR_ia32_syscalls) # endif -# define NR_syscalls (__NR_syscall_max + 1) -# define X32_NR_syscalls (__NR_x32_syscall_max + 1) -# define IA32_NR_syscalls (__NR_ia32_syscall_max + 1) +# define NR_syscalls (__NR_syscalls) # define __ARCH_WANT_NEW_STAT # define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 5fdfcb47000f..054604aba9f0 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -2,10 +2,12 @@ #ifndef _ASM_X86_HWCAP2_H #define _ASM_X86_HWCAP2_H +#include <linux/const.h> + /* MONITOR/MWAIT enabled in Ring 3 */ -#define HWCAP2_RING3MWAIT (1 << 0) +#define HWCAP2_RING3MWAIT _BITUL(0) /* Kernel allows FSGSBASE instructions available in Ring 3 */ -#define HWCAP2_FSGSBASE BIT(1) +#define HWCAP2_FSGSBASE _BITUL(1) #endif diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0662f644aad9..a6c327f8ad9e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -159,6 +159,19 @@ struct kvm_sregs { __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; +struct kvm_sregs2 { + /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 flags; + __u64 pdptrs[4]; +}; +#define KVM_SREGS2_FLAGS_PDPTRS_VALID 1 + /* for KVM_GET_FPU and KVM_SET_FPU */ struct kvm_fpu { __u8 fpr[8][16]; diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 950afebfba88..5146bbab84d4 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -33,6 +33,8 @@ #define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_FEATURE_MSI_EXT_DEST_ID 15 +#define KVM_FEATURE_HC_MAP_GPA_RANGE 16 +#define KVM_FEATURE_MIGRATION_CONTROL 17 #define KVM_HINTS_REALTIME 0 @@ -54,6 +56,7 @@ #define MSR_KVM_POLL_CONTROL 0x4b564d05 #define MSR_KVM_ASYNC_PF_INT 0x4b564d06 #define MSR_KVM_ASYNC_PF_ACK 0x4b564d07 +#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08 struct kvm_steal_time { __u64 steal; @@ -90,6 +93,16 @@ struct kvm_clock_pairing { /* MSR_KVM_ASYNC_PF_INT */ #define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) +/* MSR_KVM_MIGRATION_CONTROL */ +#define KVM_MIGRATION_READY (1 << 0) + +/* KVM_HC_MAP_GPA_RANGE */ +#define KVM_MAP_GPA_RANGE_PAGE_SZ_4K 0 +#define KVM_MAP_GPA_RANGE_PAGE_SZ_2M (1 << 0) +#define KVM_MAP_GPA_RANGE_PAGE_SZ_1G (1 << 1) +#define KVM_MAP_GPA_RANGE_ENC_STAT(n) (n << 4) +#define KVM_MAP_GPA_RANGE_ENCRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(1) +#define KVM_MAP_GPA_RANGE_DECRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(0) /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 554f75fe013c..efa969325ede 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -110,6 +110,9 @@ #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff +/* Exit code reserved for hypervisor/software use */ +#define SVM_EXIT_SW 0xf0000000 + #define SVM_EXIT_ERR -1 #define SVM_EXIT_REASONS \ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e90310cbe73a..e55e0c1fad8c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -5,6 +5,7 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> */ +#define pr_fmt(fmt) "ACPI: " fmt #include <linux/init.h> #include <linux/acpi.h> @@ -42,8 +43,6 @@ EXPORT_SYMBOL(acpi_disabled); # include <asm/proto.h> #endif /* X86 */ -#define PREFIX "ACPI: " - int acpi_noirq; /* skip ACPI IRQ initialization */ static int acpi_nobgrt; /* skip ACPI BGRT */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ @@ -130,15 +129,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) madt = (struct acpi_table_madt *)table; if (!madt) { - printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + pr_warn("Unable to map MADT\n"); return -ENODEV; } if (madt->address) { acpi_lapic_addr = (u64) madt->address; - printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", - madt->address); + pr_debug("Local APIC address 0x%08x\n", madt->address); } default_acpi_madt_oem_check(madt->header.oem_id, @@ -161,7 +159,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) int cpu; if (id >= MAX_LOCAL_APIC) { - printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + pr_info("skipped apicid that is too big\n"); return -EINVAL; } @@ -213,13 +211,13 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) */ if (!apic->apic_id_valid(apic_id)) { if (enabled) - pr_warn(PREFIX "x2apic entry ignored\n"); + pr_warn("x2apic entry ignored\n"); return 0; } acpi_register_lapic(apic_id, processor->uid, enabled); #else - printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); + pr_warn("x2apic entry ignored\n"); #endif return 0; @@ -306,7 +304,7 @@ acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, acpi_table_print_madt_entry(&header->common); if (x2apic_nmi->lint != 1) - printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + pr_warn("NMI not connected to LINT 1!\n"); return 0; } @@ -324,7 +322,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e acpi_table_print_madt_entry(&header->common); if (lapic_nmi->lint != 1) - printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + pr_warn("NMI not connected to LINT 1!\n"); return 0; } @@ -514,14 +512,14 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header, if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 override ignored.\n"); + pr_warn("BIOS IRQ0 override ignored.\n"); return 0; } if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; - printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } } @@ -597,7 +595,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) if (old == new) return; - printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); + pr_warn("setting ELCR to %04x (from %04x)\n", new, old); outb(new, 0x4d0); outb(new >> 8, 0x4d1); } @@ -754,7 +752,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { - pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); + pr_info("Unable to map lapic to logical cpu number\n"); return cpu; } @@ -870,8 +868,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { - printk(KERN_WARNING PREFIX "HPET timers must be located in " - "memory.\n"); + pr_warn("HPET timers must be located in memory.\n"); return -1; } @@ -883,9 +880,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * want to allocate a resource there. */ if (!hpet_address) { - printk(KERN_WARNING PREFIX - "HPET id: %#x base: %#lx is invalid\n", - hpet_tbl->id, hpet_address); + pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address); return 0; } #ifdef CONFIG_X86_64 @@ -896,21 +891,17 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) */ if (hpet_address == 0xfed0000000000000UL) { if (!hpet_force_user) { - printk(KERN_WARNING PREFIX "HPET id: %#x " - "base: 0xfed0000000000000 is bogus\n " - "try hpet=force on the kernel command line to " - "fix it up to 0xfed00000.\n", hpet_tbl->id); + pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n", + hpet_tbl->id); hpet_address = 0; return 0; } - printk(KERN_WARNING PREFIX - "HPET id: %#x base: 0xfed0000000000000 fixed up " - "to 0xfed00000.\n", hpet_tbl->id); + pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n", + hpet_tbl->id); hpet_address >>= 32; } #endif - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); + pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); /* * Allocate and initialize the HPET firmware resource for adding into @@ -955,24 +946,24 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { - pr_debug("ACPI: no legacy devices present\n"); + pr_debug("no legacy devices present\n"); x86_platform.legacy.devices.pnpbios = 0; } if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) && x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) { - pr_debug("ACPI: i8042 controller is absent\n"); + pr_debug("i8042 controller is absent\n"); x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT; } if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - pr_debug("ACPI: not registering RTC platform device\n"); + pr_debug("not registering RTC platform device\n"); x86_platform.legacy.rtc = 0; } if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { - pr_debug("ACPI: probing for VGA not safe\n"); + pr_debug("probing for VGA not safe\n"); x86_platform.legacy.no_vga = 1; } @@ -997,8 +988,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; } if (pmtmr_ioport) - printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", - pmtmr_ioport); + pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport); #endif return 0; } @@ -1024,8 +1014,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0); if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); + pr_err("Error parsing LAPIC address override entry\n"); return count; } @@ -1057,8 +1046,7 @@ static int __init acpi_parse_madt_lapic_entries(void) sizeof(struct acpi_table_madt), madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); if (ret < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC/X2APIC entries\n"); + pr_err("Error parsing LAPIC/X2APIC entries\n"); return ret; } @@ -1066,11 +1054,11 @@ static int __init acpi_parse_madt_lapic_entries(void) x2count = madt_proc[1].count; } if (!count && !x2count) { - printk(KERN_ERR PREFIX "No LAPIC entries present\n"); + pr_err("No LAPIC entries present\n"); /* TBD: Cleanup to allow fallback to MPS */ return -ENODEV; } else if (count < 0 || x2count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); + pr_err("Error parsing LAPIC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1080,7 +1068,7 @@ static int __init acpi_parse_madt_lapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); if (count < 0 || x2count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + pr_err("Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1139,7 +1127,7 @@ static void __init mp_config_acpi_legacy_irqs(void) } if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + pr_debug("ACPI: IRQ%d used by override.\n", i); continue; /* IRQ already used */ } @@ -1179,26 +1167,24 @@ static int __init acpi_parse_madt_ioapic_entries(void) * if "noapic" boot option, don't look for IO-APICs */ if (skip_ioapic_setup) { - printk(KERN_INFO PREFIX "Skipping IOAPIC probe " - "due to 'noapic' option.\n"); + pr_info("Skipping IOAPIC probe due to 'noapic' option.\n"); return -ENODEV; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, MAX_IO_APICS); if (!count) { - printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); + pr_err("No IOAPIC entries present\n"); return -ENODEV; } else if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); + pr_err("Error parsing IOAPIC entry\n"); return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, nr_irqs); if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing interrupt source overrides entry\n"); + pr_err("Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1218,7 +1204,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, nr_irqs); if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + pr_err("Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1251,8 +1237,7 @@ static void __init early_acpi_process_madt(void) /* * Dell Precision Workstation 410, 610 come here. */ - printk(KERN_ERR PREFIX - "Invalid BIOS MADT, disabling ACPI\n"); + pr_err("Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } } @@ -1289,8 +1274,7 @@ static void __init acpi_process_madt(void) /* * Dell Precision Workstation 410, 610 come here. */ - printk(KERN_ERR PREFIX - "Invalid BIOS MADT, disabling ACPI\n"); + pr_err("Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } } else { @@ -1300,8 +1284,7 @@ static void __init acpi_process_madt(void) * Boot with "acpi=off" to use MPS on such a system. */ if (smp_found_config) { - printk(KERN_WARNING PREFIX - "No APIC-table, disabling MPS\n"); + pr_warn("No APIC-table, disabling MPS\n"); smp_found_config = 0; } } @@ -1311,11 +1294,9 @@ static void __init acpi_process_madt(void) * processors, where MPS only supports physical. */ if (acpi_lapic && acpi_ioapic) - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + pr_info("Using ACPI (MADT) for SMP configuration information\n"); else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); + pr_info("Using ACPI for processor (LAPIC) configuration information\n"); #endif return; } @@ -1323,8 +1304,7 @@ static void __init acpi_process_madt(void) static int __init disable_acpi_irq(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", - d->ident); + pr_notice("%s detected: force use of acpi=noirq\n", d->ident); acpi_noirq_set(); } return 0; @@ -1333,8 +1313,7 @@ static int __init disable_acpi_irq(const struct dmi_system_id *d) static int __init disable_acpi_pci(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", - d->ident); + pr_notice("%s detected: force use of pci=noacpi\n", d->ident); acpi_disable_pci(); } return 0; @@ -1343,11 +1322,10 @@ static int __init disable_acpi_pci(const struct dmi_system_id *d) static int __init dmi_disable_acpi(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: acpi off\n", d->ident); + pr_notice("%s detected: acpi off\n", d->ident); disable_acpi(); } else { - printk(KERN_NOTICE - "Warning: DMI blacklist says broken, but acpi forced\n"); + pr_notice("Warning: DMI blacklist says broken, but acpi forced\n"); } return 0; } @@ -1574,9 +1552,9 @@ int __init early_acpi_boot_init(void) */ if (acpi_blacklisted()) { if (acpi_force) { - printk(KERN_WARNING PREFIX "acpi=force override\n"); + pr_warn("acpi=force override\n"); } else { - printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); + pr_warn("Disabling ACPI support\n"); disable_acpi(); return 1; } @@ -1692,9 +1670,7 @@ int __init acpi_mps_check(void) #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ if (acpi_disabled || acpi_noirq) { - printk(KERN_WARNING "MPS support code is not built-in.\n" - "Using acpi=off or acpi=noirq or pci=noacpi " - "may have problem\n"); + pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; } #endif diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 49ae4e1ac9cd..7de599eba7f0 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -197,7 +197,8 @@ static int __init ffh_cstate_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_INTEL && - c->x86_vendor != X86_VENDOR_AMD) + c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) return -1; cpu_cstate_entry = alloc_percpu(struct cstate_entry); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6fe5b44fcbc9..e9da3dc71254 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -75,7 +75,7 @@ do { \ } \ } while (0) -const unsigned char x86nops[] = +static const unsigned char x86nops[] = { BYTES_NOP1, BYTES_NOP2, @@ -301,8 +301,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 09083094eb57..23dda362dc0f 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -25,6 +25,7 @@ #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e /* Protect the PCI config register pairs used for SMN and DF indirect access. */ static DEFINE_MUTEX(smn_mutex); @@ -57,6 +58,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, {} }; @@ -72,6 +74,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c06ac56eae4d..b7c003013d41 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -646,6 +646,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & BIT(12)) set_cpu_cap(c, X86_FEATURE_ACC_POWER); + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #else diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a1b756c49a93..a99d00393206 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1773,10 +1773,16 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif - /* Flags to clear on syscall */ + /* + * Flags to clear on syscall; clear as much as possible + * to minimize user space-kernel interference. + */ wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| - X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); + X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| + X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| + X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| + X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF| + X86_EFLAGS_AC|X86_EFLAGS_ID); } #else /* CONFIG_X86_64 */ @@ -1938,13 +1944,12 @@ void cpu_init_exception_handling(void) /* * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. + * initialized (naturally) in the bootstrap process, such as the GDT. We + * reload it nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. */ void cpu_init(void) { - struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct task_struct *cur = current; int cpu = raw_smp_processor_id(); @@ -1957,8 +1962,6 @@ void cpu_init(void) early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif - setup_getcpu(cpu); - pr_debug("Initializing CPU#%d\n", cpu); if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || @@ -1970,7 +1973,6 @@ void cpu_init(void) * and set up the GDT descriptor: */ switch_to_new_gdt(cpu); - load_current_idt(); if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); @@ -1990,12 +1992,6 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, cur); - /* Initialize the TSS. */ - tss_setup_ist(tss); - tss_setup_io_bitmap(tss); - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - - load_TR_desc(); /* * sp0 points to the entry trampoline stack regardless of what task * is running. @@ -2017,6 +2013,18 @@ void cpu_init(void) load_fixmap_gdt(cpu); } +#ifdef CONFIG_SMP +void cpu_init_secondary(void) +{ + /* + * Relies on the BP having set-up the IDT tables, which are loaded + * on this CPU in cpu_init_exception_handling(). + */ + cpu_init_exception_handling(); + cpu_init(); +} +#endif + /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 67944128876d..95521302630d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -48,6 +48,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], enum tsx_ctrl_states { TSX_CTRL_ENABLE, TSX_CTRL_DISABLE, + TSX_CTRL_RTM_ALWAYS_ABORT, TSX_CTRL_NOT_SUPPORTED, }; @@ -56,6 +57,7 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); extern void tsx_enable(void); extern void tsx_disable(void); +extern void tsx_clear_cpuid(void); #else static inline void tsx_init(void) { } #endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 0bd6c74e3ba1..6d50136f7ab9 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -260,6 +260,10 @@ static void early_init_hygon(struct cpuinfo_x86 *c) if (c->x86_power & BIT(12)) set_cpu_cap(c, X86_FEATURE_ACC_POWER); + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8adffc17fa8b..8321c43554a1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -10,6 +10,7 @@ #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> +#include <linux/delay.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -41,6 +42,7 @@ enum split_lock_detect_state { sld_off = 0, sld_warn, sld_fatal, + sld_ratelimit, }; /* @@ -717,8 +719,10 @@ static void init_intel(struct cpuinfo_x86 *c) if (tsx_ctrl_state == TSX_CTRL_ENABLE) tsx_enable(); - if (tsx_ctrl_state == TSX_CTRL_DISABLE) + else if (tsx_ctrl_state == TSX_CTRL_DISABLE) tsx_disable(); + else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT) + tsx_clear_cpuid(); split_lock_init(); bus_lock_init(); @@ -997,13 +1001,30 @@ static const struct { { "off", sld_off }, { "warn", sld_warn }, { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, }; +static struct ratelimit_state bld_ratelimit; + static inline bool match_option(const char *arg, int arglen, const char *opt) { - int len = strlen(opt); + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } - return len == arglen && !strncmp(arg, opt, len); + return len == arglen; } static bool split_lock_verify_msr(bool on) @@ -1082,6 +1103,15 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + if (cpu_model_supports_sld) split_lock_verify_msr(sld_state != sld_off); } @@ -1154,6 +1184,12 @@ void handle_bus_lock(struct pt_regs *regs) switch (sld_state) { case sld_off: break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; case sld_warn: pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", current->comm, current->pid, regs->ip); @@ -1259,6 +1295,10 @@ static void sld_state_show(void) " from non-WB" : ""); } break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; } } diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index e486f96b3cb3..08831acc1d03 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -77,27 +77,29 @@ struct smca_bank_name { }; static struct smca_bank_name smca_names[] = { - [SMCA_LS] = { "load_store", "Load Store Unit" }, - [SMCA_LS_V2] = { "load_store", "Load Store Unit" }, - [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, - [SMCA_DE] = { "decode_unit", "Decode Unit" }, - [SMCA_RESERVED] = { "reserved", "Reserved" }, - [SMCA_EX] = { "execution_unit", "Execution Unit" }, - [SMCA_FP] = { "floating_point", "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, - [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, - [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, - [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, - [SMCA_UMC] = { "umc", "Unified Memory Controller" }, - [SMCA_PB] = { "param_block", "Parameter Block" }, - [SMCA_PSP] = { "psp", "Platform Security Processor" }, - [SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, - [SMCA_SMU] = { "smu", "System Management Unit" }, - [SMCA_SMU_V2] = { "smu", "System Management Unit" }, - [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, - [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, - [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, + [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + + /* UMC v2 is separate because both of them can exist in a single system. */ + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, + [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, + [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, + [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, + [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, + [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, + [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, }; static const char *smca_get_name(enum smca_bank_types t) @@ -155,6 +157,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Unified Memory Controller MCA type */ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, + { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) }, /* Parameter Block MCA type */ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, @@ -175,6 +178,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* PCI Express Unit MCA type */ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, + { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, + + /* xGMI PCS MCA type */ + { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, + + /* xGMI PHY MCA type */ + { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, + + /* WAFL PHY MCA type */ + { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index b58b85380ddb..0e3ae64d3b76 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -36,7 +36,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) mce_setup(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT; if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index bf7fe87a7e88..22791aadc085 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1257,19 +1257,28 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. + */ + if (ret == -EHWPOISON) + return; + if (p->mce_vaddr != (void __user *)-1l) { force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); } else { diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 22f13343b5da..01ca94f42e4e 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -236,7 +236,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) for_each_present_cpu(i) { if (i == 0) continue; - ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, i); BUG_ON(ret); } @@ -252,6 +252,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) static void __init ms_hyperv_init_platform(void) { + int hv_max_functions_eax; int hv_host_info_eax; int hv_host_info_ebx; int hv_host_info_ecx; @@ -269,6 +270,8 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); + hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, ms_hyperv.misc_features); @@ -298,8 +301,7 @@ static void __init ms_hyperv_init_platform(void) /* * Extract host information. */ - if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >= - HYPERV_CPUID_VERSION) { + if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) { hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); @@ -325,9 +327,11 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); } - if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { + if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { ms_hyperv.nested_features = cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); + pr_info("Hyper-V: Nested features: 0x%x\n", + ms_hyperv.nested_features); } /* diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c4d320d02fd5..6a5f60a37219 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -70,6 +70,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * struct mon_evt - Entry in the event list of a resource * @evtid: event id * @name: name of the event + * @list: entry in &rdt_resource->evt_list */ struct mon_evt { u32 evtid; @@ -78,10 +79,13 @@ struct mon_evt { }; /** - * struct mon_data_bits - Monitoring details for each event file - * @rid: Resource id associated with the event file. + * union mon_data_bits - Monitoring details for each event file + * @priv: Used to store monitoring event data in @u + * as kernfs private data + * @rid: Resource id associated with the event file * @evtid: Event id associated with the event file * @domid: The domain to which the event file belongs + * @u: Name of the bit fields struct */ union mon_data_bits { void *priv; @@ -119,6 +123,7 @@ enum rdt_group_type { * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes * * The mode of a resource group enables control over the allowed overlap * between allocations associated with different resource groups (classes @@ -142,7 +147,7 @@ enum rdtgrp_mode { /** * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn kernlfs node for the mon_data directory + * @mon_data_kn: kernfs node for the mon_data directory * @parent: parent rdtgrp * @crdtgrp_list: child rdtgroup node list * @rmid: rmid for this rdtgroup @@ -282,11 +287,11 @@ struct rftype { /** * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting - * @prev_bw The most recent bandwidth in MBps - * @delta_bw Difference between the current and previous bandwidth - * @delta_comp Indicates whether to compute the delta_bw + * @prev_bw: The most recent bandwidth in MBps + * @delta_bw: Difference between the current and previous bandwidth + * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 chunks; @@ -456,11 +461,13 @@ struct rdt_parse_data { * @data_width: Character width of data when displaying * @domains: All domains for this resource * @cache: Cache allocation related data + * @membw: If the component has bandwidth controls, their properties. * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values * @evt_list: List of monitoring events * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @mbm_width: Monitor width, to detect and correct for overflow. * @fflags: flags to choose base and info files */ struct rdt_resource { diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 05a89e33fde2..2207916cae65 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -49,6 +49,7 @@ static struct class *pseudo_lock_class; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support * pseudo-locking. This includes testing to ensure pseudo-locked regions @@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor) } /** - * pseudo_lock_pm_req - A power management QoS request list entry + * struct pseudo_lock_pm_req - A power management QoS request list entry * @list: Entry within the @pm_reqs list for a pseudo-locked region * @req: PM QoS request */ @@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) /** * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * @plr: Pseudo-locked region * * To prevent the cache from being affected by power management entering * C6 has to be avoided. This is accomplished by requesting a latency @@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) * the ACPI latencies need to be considered while keeping in mind that C2 * may be set to map to deeper sleep states. In this case the latency * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure */ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) { @@ -520,7 +524,7 @@ static int pseudo_lock_fn(void *_rdtgrp) /** * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @r: resource group being queried + * @rdtgrp: resource group being queried * * Return: 1 if monitor groups have been created for this resource * group, 0 otherwise. @@ -1140,6 +1144,8 @@ out: /** * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * @rdtgrp: Resource group to which the pseudo-locked region belongs. + * @sel: Selector of which measurement to perform on a pseudo-locked region. * * The measurement of latency to access a pseudo-locked region should be * done from a cpu that is associated with that pseudo-locked region. diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 3be203297988..001808e3901c 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -383,7 +383,7 @@ const struct vm_operations_struct sgx_vm_ops = { /** * sgx_encl_release - Destroy an enclave instance - * @kref: address of a kref inside &sgx_encl + * @ref: address of a kref inside &sgx_encl * * Used together with kref_put(). Frees all the resources associated with the * enclave and the instance itself. diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 6e74f85b6264..fec43ca65065 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *result; - result = find_vma(mm, addr); - if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) return -EINVAL; *vma = result; diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index e2ad30e474f8..9c7a5f049292 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -2,7 +2,7 @@ /* * Intel Transactional Synchronization Extensions (TSX) control. * - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * Author: * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> @@ -84,13 +84,46 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) return TSX_CTRL_ENABLE; } +void tsx_clear_cpuid(void) +{ + u64 msr; + + /* + * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID + * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && + boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + rdmsrl(MSR_TSX_FORCE_ABORT, msr); + msr |= MSR_TFA_TSX_CPUID_CLEAR; + wrmsrl(MSR_TSX_FORCE_ABORT, msr); + } +} + void __init tsx_init(void) { char arg[5] = {}; int ret; - if (!tsx_ctrl_is_supported()) + /* + * Hardware will always abort a TSX transaction if both CPUID bits + * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is + * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them + * here. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && + boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT; + tsx_clear_cpuid(); + setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); return; + } + + if (!tsx_ctrl_is_supported()) { + tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED; + return; + } ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); if (ret >= 0) { diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 54ce999ed321..e8326a8d1c5d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) rcu_read_unlock(); } -/* - * When the crashkernel option is specified, only use the low - * 1M for the real mode trampoline. - */ -void __init crash_reserve_low_1M(void) -{ - if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) - return; - - memblock_reserve(0, 1<<20); - pr_info("Reserving the low 1M of memory for crashkernel\n"); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ec3ae3054792..b7b92cdf3add 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -221,28 +221,18 @@ sanitize_restored_user_xstate(union fpregs_state *state, if (use_xsave()) { /* - * Note: we don't need to zero the reserved bits in the - * xstate_header here because we either didn't copy them at all, - * or we checked earlier that they aren't set. + * Clear all feature bits which are not set in + * user_xfeatures and clear all extended features + * for fx_only mode. */ + u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures; /* - * 'user_xfeatures' might have bits clear which are - * set in header->xfeatures. This represents features that - * were in init state prior to a signal delivery, and need - * to be reset back to the init state. Clear any user - * feature bits which are set in the kernel buffer to get - * them back to the init state. - * - * Supervisor state is unchanged by input from userspace. - * Ensure supervisor state bits stay set and supervisor - * state is not modified. + * Supervisor state has to be preserved. The sigframe + * restore can only modify user features, i.e. @mask + * cannot contain them. */ - if (fx_only) - header->xfeatures = XFEATURE_MASK_FPSSE; - else - header->xfeatures &= user_xfeatures | - xfeatures_mask_supervisor(); + header->xfeatures &= mask | xfeatures_mask_supervisor(); } if (use_fxsr()) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d0eef963aad1..1cadb2faf740 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -441,12 +441,35 @@ static void __init print_xstate_offset_size(void) } /* + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. + */ +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_PASID) + +/* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { static int on_boot_cpu __initdata = 1; + BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED) != + XFEATURES_INIT_FPSTATE_HANDLED); + WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -466,10 +489,22 @@ static void __init setup_init_fpu_buf(void) copy_kernel_to_xregs_booting(&init_fpstate.xsave); /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVES because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVES is available because XSAVES uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. */ - copy_xregs_to_kernel_booting(&init_fpstate.xsave); + fxsave(&init_fpstate.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 04bddaaba8e2..d8b3ebd2bb85 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -62,7 +62,7 @@ SYM_CODE_START_NOALIGN(startup_64) */ /* Set up the stack for verify_cpu(), similar to initial_stack below */ - leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp + leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp leaq _text(%rip), %rdi pushq %rsi @@ -343,10 +343,10 @@ SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) #endif /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder * reliably detect the end of the stack. */ -SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS) +SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE) __FINITDATA __INIT diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d552f177eca0..df0fa695bb09 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -35,12 +35,16 @@ #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) +#ifdef CONFIG_X86_64 /* * Interrupt gate with interrupt stack. The _ist index is the index in * the tss.ist[] array, but for the descriptor it needs to start at 1. */ #define ISTG(_vector, _addr, _ist) \ G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) +#else +#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr) +#endif /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -74,7 +78,7 @@ static const __initconst struct idt_data early_idts[] = { */ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), - INTG(X86_TRAP_NMI, asm_exc_nmi), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), @@ -91,12 +95,16 @@ static const __initconst struct idt_data def_idts[] = { #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), #else - INTG(X86_TRAP_DF, asm_exc_double_fault), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), #endif - INTG(X86_TRAP_DB, asm_exc_debug), + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, asm_exc_machine_check), + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT + ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), #endif SYSG(X86_TRAP_OF, asm_exc_overflow), @@ -221,22 +229,6 @@ static const __initconst struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, asm_exc_page_fault), }; -/* - * The exceptions which use Interrupt stacks. They are setup after - * cpu_init() when the TSS has been initialized. - */ -static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), -#ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), -#endif -#ifdef CONFIG_AMD_MEM_ENCRYPT - ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), -#endif -}; - /** * idt_setup_early_pf - Initialize the idt table with early pagefault handler * @@ -254,14 +246,6 @@ void __init idt_setup_early_pf(void) idt_setup_from_table(idt_table, early_pf_idts, ARRAY_SIZE(early_pf_idts), true); } - -/** - * idt_setup_ist_traps - Initialize the idt table with traps using IST - */ -void __init idt_setup_ist_traps(void) -{ - idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); -} #endif static void __init idt_map_in_cea(void) @@ -331,11 +315,10 @@ void __init idt_setup_early_handler(void) /** * idt_invalidate - Invalidate interrupt descriptor table - * @addr: The virtual address of the 'invalid' IDT */ -void idt_invalidate(void *addr) +void idt_invalidate(void) { - struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + static const struct desc_ptr idt = { .address = 0, .size = 0 }; load_idt(&idt); } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 6a2eb62c85e6..674906fad43b 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -15,50 +15,75 @@ #include <asm/kprobes.h> #include <asm/alternative.h> #include <asm/text-patching.h> +#include <asm/insn.h> -static void bug_at(const void *ip, int line) +int arch_jump_entry_size(struct jump_entry *entry) { - /* - * The location is not an op that we were expecting. - * Something went wrong. Crash the box, as something could be - * corrupting the kernel. - */ - pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); - BUG(); + struct insn insn = {}; + + insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); + BUG_ON(insn.length != 2 && insn.length != 5); + + return insn.length; } -static const void * -__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) +struct jump_label_patch { + const void *code; + int size; +}; + +static struct jump_label_patch +__jump_label_patch(struct jump_entry *entry, enum jump_label_type type) { - const void *expect, *code; + const void *expect, *code, *nop; const void *addr, *dest; - int line; + int size; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); - code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + size = arch_jump_entry_size(entry); + switch (size) { + case JMP8_INSN_SIZE: + code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; - if (type == JUMP_LABEL_JMP) { - expect = x86_nops[5]; line = __LINE__; - } else { - expect = code; line = __LINE__; + case JMP32_INSN_SIZE: + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; + + default: BUG(); } - if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) - bug_at(addr, line); + if (type == JUMP_LABEL_JMP) + expect = nop; + else + expect = code; + + if (memcmp(addr, expect, size)) { + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", + addr, addr, addr, expect, size, type); + BUG(); + } if (type == JUMP_LABEL_NOP) - code = x86_nops[5]; + code = nop; - return code; + return (struct jump_label_patch){.code = code, .size = size}; } static inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { - const void *opcode = __jump_label_set_jump_code(entry, type); + const struct jump_label_patch jlp = __jump_label_patch(entry, type); /* * As long as only a single processor is running and the code is still @@ -72,12 +97,11 @@ static inline void __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), opcode, - JUMP_LABEL_NOP_SIZE); + text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); return; } - text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); + text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -98,7 +122,7 @@ void arch_jump_label_transform(struct jump_entry *entry, bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - const void *opcode; + struct jump_label_patch jlp; if (system_state == SYSTEM_BOOTING) { /* @@ -109,9 +133,8 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } mutex_lock(&text_mutex); - opcode = __jump_label_set_jump_code(entry, type); - text_poke_queue((void *)jump_entry_code(entry), - opcode, JUMP_LABEL_NOP_SIZE, NULL); + jlp = __jump_label_patch(entry, type); + text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d3d65545cb8b..c492ad3001ca 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -674,7 +674,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn) break; if (insn->addr_bytes != sizeof(unsigned long)) - return -EOPNOTSUPP; /* Don't support differnt size */ + return -EOPNOTSUPP; /* Don't support different size */ if (X86_MODRM_MOD(opcode) != 3) return -EOPNOTSUPP; /* TODO: support memory addressing */ @@ -1102,24 +1102,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || - kcb->kprobe_status == KPROBE_HIT_SSDONE) { - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; } return 0; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 64b00b0d7fe8..1b373d79cedc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,17 +23,6 @@ #include <asm/set_memory.h> #include <asm/debugreg.h> -static void set_gdt(void *newgdt, __u16 limit) -{ - struct desc_ptr curgdt; - - /* ia32 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - load_gdt(&curgdt); -} - static void load_segments(void) { #define __STR(X) #X @@ -232,8 +221,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - idt_invalidate(phys_to_virt(0)); - set_gdt(phys_to_virt(0), 0); + native_idt_invalidate(); + native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index c078b0d3ab0e..131f30fdcfbd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -256,35 +256,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) return init_transition_pgtable(image, level4p); } -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaligned loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - static void load_segments(void) { __asm__ __volatile__ ( @@ -379,8 +350,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + native_idt_invalidate(); + native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel((unsigned long)image->head, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e1f38179f49..e52b208b4641 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -931,7 +931,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long start, bottom, top, sp, fp, ip, ret = 0; int count = 0; - if (p == current || p->state == TASK_RUNNING) + if (p == current || task_is_running(p)) return 0; if (!try_get_task_stack(p)) @@ -975,7 +975,7 @@ unsigned long get_wchan(struct task_struct *p) goto out; } fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); - } while (count++ < 16 && p->state != TASK_RUNNING); + } while (count++ < 16 && !task_is_running(p)); out: put_task_stack(p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 87a4143aa7d7..4c208ea3bd9f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -911,7 +911,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) * syscall with TS_COMPAT still set. */ regs->orig_ax = value; - if (syscall_get_nr(child, regs) >= 0) + if (syscall_get_nr(child, regs) != -1) child->thread_info.status |= TS_I386_REGS_POKED; break; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index b29657b76e3f..ebfb91108232 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -669,7 +669,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - idt_invalidate(NULL); + idt_invalidate(); __asm__ __volatile__("int3"); /* We're probably dead after this, but... */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1e720626069a..85acd22f8022 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -695,30 +695,6 @@ static void __init e820_add_kernel_range(void) e820__range_add(start, size, E820_TYPE_RAM); } -static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; - -static int __init parse_reservelow(char *p) -{ - unsigned long long size; - - if (!p) - return -EINVAL; - - size = memparse(p, &p); - - if (size < 4096) - size = 4096; - - if (size > 640*1024) - size = 640*1024; - - reserve_low = size; - - return 0; -} - -early_param("reservelow", parse_reservelow); - static void __init early_reserve_memory(void) { /* @@ -1084,17 +1060,18 @@ void __init setup_arch(char **cmdline_p) #endif /* - * Find free memory for the real mode trampoline and place it - * there. - * If there is not enough free memory under 1M, on EFI-enabled - * systems there will be additional attempt to reclaim the memory - * for the real mode trampoline at efi_free_boot_services(). + * Find free memory for the real mode trampoline and place it there. If + * there is not enough free memory under 1M, on EFI-enabled systems + * there will be additional attempt to reclaim the memory for the real + * mode trampoline at efi_free_boot_services(). + * + * Unconditionally reserve the entire first 1M of RAM because BIOSes + * are known to corrupt low memory and several hundred kilobytes are not + * worth complex detection what memory gets clobbered. Windows does the + * same thing for very similar reasons. * - * Unconditionally reserve the entire first 1M of RAM because - * BIOSes are know to corrupt low memory and several - * hundred kilobytes are not worth complex detection what memory gets - * clobbered. Moreover, on machines with SandyBridge graphics or in - * setups that use crashkernel the entire 1M is reserved anyway. + * Moreover, on machines with SandyBridge graphics or in setups that use + * crashkernel the entire 1M is reserved anyway. */ reserve_real_mode(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0941d2f44f2a..78a32b956e81 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset); */ static bool __init pcpu_need_numa(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA pg_data_t *last = NULL; unsigned int cpu; @@ -101,7 +101,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = early_cpu_to_node(cpu); void *ptr; @@ -140,7 +140,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size) static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 651b81cd648e..a6895e440bc3 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -7,12 +7,11 @@ * Author: Joerg Roedel <jroedel@suse.de> */ -#define pr_fmt(fmt) "SEV-ES: " fmt +#define pr_fmt(fmt) "SEV: " fmt #include <linux/sched/debug.h> /* For show_regs() */ #include <linux/percpu-defs.h> #include <linux/mem_encrypt.h> -#include <linux/lockdep.h> #include <linux/printk.h> #include <linux/mm_types.h> #include <linux/set_memory.h> @@ -192,11 +191,19 @@ void noinstr __sev_es_ist_exit(void) this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); } -static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) { struct sev_es_runtime_data *data; struct ghcb *ghcb; + WARN_ON(!irqs_disabled()); + data = this_cpu_read(runtime_data); ghcb = &data->ghcb_page; @@ -213,7 +220,9 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) data->ghcb_active = false; data->backup_ghcb_active = false; + instrumentation_begin(); panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); } /* Mark backup_ghcb active before writing to it */ @@ -258,17 +267,24 @@ static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) { char buffer[MAX_INSN_SIZE]; - int res; + int insn_bytes; - res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (!res) { + insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (insn_bytes == 0) { + /* Nothing could be copied */ ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; ctxt->fi.cr2 = ctxt->regs->ip; return ES_EXCEPTION; + } else if (insn_bytes == -EINVAL) { + /* Effective RIP could not be calculated */ + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + ctxt->fi.cr2 = 0; + return ES_EXCEPTION; } - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) return ES_DECODE_FAILED; if (ctxt->insn.immediate.got) @@ -479,11 +495,13 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +static noinstr void __sev_put_ghcb(struct ghcb_state *state) { struct sev_es_runtime_data *data; struct ghcb *ghcb; + WARN_ON(!irqs_disabled()); + data = this_cpu_read(runtime_data); ghcb = &data->ghcb_page; @@ -507,7 +525,7 @@ void noinstr __sev_es_nmi_complete(void) struct ghcb_state state; struct ghcb *ghcb; - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); vc_ghcb_invalidate(ghcb); ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); @@ -517,7 +535,7 @@ void noinstr __sev_es_nmi_complete(void) sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); VMGEXIT(); - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); } static u64 get_jump_table_addr(void) @@ -529,7 +547,7 @@ static u64 get_jump_table_addr(void) local_irq_save(flags); - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); vc_ghcb_invalidate(ghcb); ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); @@ -543,7 +561,7 @@ static u64 get_jump_table_addr(void) ghcb_sw_exit_info_2_is_valid(ghcb)) ret = ghcb->save.sw_exit_info_2; - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); local_irq_restore(flags); @@ -668,7 +686,7 @@ static void sev_es_ap_hlt_loop(void) struct ghcb_state state; struct ghcb *ghcb; - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); while (true) { vc_ghcb_invalidate(ghcb); @@ -685,7 +703,7 @@ static void sev_es_ap_hlt_loop(void) break; } - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); } /* @@ -775,7 +793,7 @@ void __init sev_es_init_vc_handling(void) sev_es_setup_play_dead(); /* Secondary CPUs use the runtime #VC handler */ - initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; + initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; } static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) @@ -1213,14 +1231,6 @@ static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, return ES_EXCEPTION; } -static __always_inline void vc_handle_trap_db(struct pt_regs *regs) -{ - if (user_mode(regs)) - noist_exc_debug(regs); - else - exc_debug(regs); -} - static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, struct ghcb *ghcb, unsigned long exit_code) @@ -1316,44 +1326,15 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); } -/* - * Main #VC exception handler. It is called when the entry code was able to - * switch off the IST to a safe kernel stack. - * - * With the current implementation it is always possible to switch to a safe - * stack because #VC exceptions only happen at known places, like intercepted - * instructions or accesses to MMIO areas/IO ports. They can also happen with - * code instrumentation when the hypervisor intercepts #DB, but the critical - * paths are forbidden to be instrumented, so #DB exceptions currently also - * only happen in safe places. - */ -DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) +static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) { - irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; + bool ret = true; - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { - vc_handle_trap_db(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - lockdep_assert_irqs_disabled(); - instrumentation_begin(); - - /* - * This is invoked through an interrupt gate, so IRQs are disabled. The - * code below might walk page-tables for user or kernel addresses, so - * keep the IRQs disabled to protect us against concurrent TLB flushes. - */ - - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); vc_ghcb_invalidate(ghcb); result = vc_init_em_ctxt(&ctxt, regs, error_code); @@ -1361,7 +1342,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) if (result == ES_OK) result = vc_handle_exitcode(&ctxt, ghcb, error_code); - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); /* Done - now check the result */ switch (result) { @@ -1369,17 +1350,20 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) vc_finish_insn(&ctxt); break; case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", error_code, regs->ip); - goto fail; + ret = false; + break; case ES_VMM_ERROR: pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", error_code, regs->ip); - goto fail; + ret = false; + break; case ES_DECODE_FAILED: pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", error_code, regs->ip); - goto fail; + ret = false; + break; case ES_EXCEPTION: vc_forward_exception(&ctxt); break; @@ -1395,24 +1379,52 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) BUG(); } -out: - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); + return ret; +} - return; +static __always_inline bool vc_is_db(unsigned long error_code) +{ + return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; +} -fail: - if (user_mode(regs)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } else { - pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", - result); +/* + * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode + * and will panic when an error happens. + */ +DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) +{ + irqentry_state_t irq_state; + + /* + * With the current implementation it is always possible to switch to a + * safe stack because #VC exceptions only happen at known places, like + * intercepted instructions or accesses to MMIO areas/IO ports. They can + * also happen with code instrumentation when the hypervisor intercepts + * #DB, but the critical paths are forbidden to be instrumented, so #DB + * exceptions currently also only happen in safe places. + * + * But keep this here in case the noinstr annotations are violated due + * to bug elsewhere. + */ + if (unlikely(on_vc_fallback_stack(regs))) { + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); + } + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + exc_debug(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { /* Show some debug info */ show_regs(regs); @@ -1423,23 +1435,38 @@ fail: panic("Returned from Terminate-Request to Hypervisor\n"); } - goto out; + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); } -/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */ -DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) +/* + * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode + * and will kill the current task with SIGBUS when an error happens. + */ +DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) { + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + noist_exc_debug(regs); + return; + } + + irqentry_enter_from_user_mode(regs); instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); -} -DEFINE_IDTENTRY_VC(exc_vmm_communication) -{ - if (likely(!on_vc_fallback_stack(regs))) - safe_stack_exc_vmm_communication(regs, error_code); - else - ist_exc_vmm_communication(regs, error_code); + if (!vc_raw_handle_exception(regs, error_code)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } + + instrumentation_end(); + irqentry_exit_to_user_mode(regs); } bool __init handle_vc_boot_ghcb(struct pt_regs *regs) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index a06cb107c0e8..e12779a2714d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -713,7 +713,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); /* Are we from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { + if (syscall_get_nr(current, regs) != -1) { /* If so, check system call restarting.. */ switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: @@ -793,7 +793,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) } /* Did we come from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { + if (syscall_get_nr(current, regs) != -1) { /* Restart the system call - no handlers present */ switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7770245cc7fa..9320285a5e29 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -232,11 +232,9 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - cpu_init_exception_handling(); - cpu_init(); + cpu_init_secondary(); rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); smp_callin(); enable_start_cpu0 = 0; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 853ea7a80806..ed540e09a399 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1160,12 +1160,9 @@ void __init trap_init(void) /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); + /* Initialize TSS before setting up traps so ISTs work */ + cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ idt_setup_traps(); - - /* - * Should be a barrier for any external CPU state: - */ cpu_init(); - - idt_setup_ist_traps(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57ec01192180..2e076a459a0c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1128,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs) static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, + .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | @@ -1152,7 +1153,8 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | - CLOCK_SOURCE_MUST_VERIFY, + CLOCK_SOURCE_MUST_VERIFY | + CLOCK_SOURCE_VERIFY_PERCPU, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 8daa70b0d2da..576b47e7523d 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -346,14 +346,12 @@ bool fixup_umip_exception(struct pt_regs *regs) if (!regs) return false; - nr_copied = insn_fetch_from_user(regs, buf); - /* - * The insn_fetch_from_user above could have failed if user code - * is protected by a memory protection key. Give up on emulation - * in such a case. Should we issue a page fault? + * Give up on emulation if fetching the instruction failed. Should a + * page fault or a #GP be issued? */ - if (!nr_copied) + nr_copied = insn_fetch_from_user(regs, buf); + if (nr_copied <= 0) return false; if (!insn_decode_from_regs(&insn, regs, buf, nr_copied)) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f6b93a35ce14..ac69894eab88 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,8 +22,6 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM depends on HIGH_RES_TIMERS - # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET && MULTIUSER depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER @@ -36,8 +34,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO - select TASKSTATS - select TASK_DELAY_ACCT + select SCHED_INFO select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT @@ -46,6 +43,7 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU + select HAVE_KVM_PM_NOTIFIER if PM help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c589db5d91b3..75dfd27b6e8a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -11,13 +11,18 @@ KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ - $(KVM)/dirty_ring.o + $(KVM)/dirty_ring.o $(KVM)/binary_stats.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ mmu/spte.o + +ifdef CONFIG_HYPERV +kvm-y += kvm_onhyperv.o +endif + kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o @@ -27,6 +32,10 @@ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o +ifdef CONFIG_HYPERV +kvm-amd-y += svm/svm_onhyperv.o +endif + obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b4da665bb892..c42613cfb5ba 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -202,10 +202,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu); /* - * Except for the MMU, which needs to be reset after any vendor - * specific adjustments to the reserved GPA bits. + * Except for the MMU, which needs to do its thing any vendor specific + * adjustments to the reserved GPA bits. */ - kvm_mmu_reset_context(vcpu); + kvm_mmu_after_set_cpuid(vcpu); } static int is_efer_nx(void) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 7e818d64bb4d..95a98413dc32 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -17,6 +17,15 @@ static int vcpu_get_timer_advance_ns(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n"); +static int vcpu_get_guest_mode(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->stat.guest_mode; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n"); + static int vcpu_get_tsc_offset(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; @@ -45,6 +54,8 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bi void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { + debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu, + &vcpu_guest_mode_fops); debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu, &vcpu_tsc_offset_fops); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e5de05a8fbf..2837110e66ed 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -22,7 +22,6 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include <linux/stringify.h> -#include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/nospec-branch.h> @@ -1081,116 +1080,14 @@ static void fetch_register_operand(struct operand *op) } } -static void emulator_get_fpu(void) -{ - fpregs_lock(); - - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); -} - -static void emulator_put_fpu(void) -{ - fpregs_unlock(); -} - -static void read_sse_reg(sse128_t *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; -#ifdef CONFIG_X86_64 - case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; -#endif - default: BUG(); - } - emulator_put_fpu(); -} - -static void write_sse_reg(sse128_t *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; -#ifdef CONFIG_X86_64 - case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; - case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; - case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; -#endif - default: BUG(); - } - emulator_put_fpu(); -} - -static void read_mmx_reg(u64 *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; - case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; - case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; - case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; - case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; - case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; - case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; - case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; - default: BUG(); - } - emulator_put_fpu(); -} - -static void write_mmx_reg(u64 *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; - case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; - case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; - case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; - case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; - case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; - case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; - case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; - default: BUG(); - } - emulator_put_fpu(); -} - static int em_fninit(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - emulator_get_fpu(); + kvm_fpu_get(); asm volatile("fninit"); - emulator_put_fpu(); + kvm_fpu_put(); return X86EMUL_CONTINUE; } @@ -1201,9 +1098,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - emulator_get_fpu(); + kvm_fpu_get(); asm volatile("fnstcw %0": "+m"(fcw)); - emulator_put_fpu(); + kvm_fpu_put(); ctxt->dst.val = fcw; @@ -1217,9 +1114,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - emulator_get_fpu(); + kvm_fpu_get(); asm volatile("fnstsw %0": "+m"(fsw)); - emulator_put_fpu(); + kvm_fpu_put(); ctxt->dst.val = fsw; @@ -1238,7 +1135,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; - read_sse_reg(&op->vec_val, reg); + kvm_read_sse_reg(reg, &op->vec_val); return; } if (ctxt->d & Mmx) { @@ -1289,7 +1186,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = ctxt->modrm_rm; - read_sse_reg(&op->vec_val, ctxt->modrm_rm); + kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val); return rc; } if (ctxt->d & Mmx) { @@ -1866,10 +1763,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->bytes * op->count); break; case OP_XMM: - write_sse_reg(&op->vec_val, op->addr.xmm); + kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; case OP_MM: - write_mmx_reg(&op->mm_val, op->addr.mm); + kvm_write_mmx_reg(op->addr.mm, &op->mm_val); break; case OP_NONE: /* no writeback */ @@ -2638,8 +2535,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & - ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); + ctxt->ops->exiting_smm(ctxt); /* * Get back to real mode, to prepare a safe state in which to load @@ -2678,12 +2574,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) } /* - * Give pre_leave_smm() a chance to make ISA-specific changes to the - * vCPU state (e.g. enter guest mode) before loading state from the SMM + * Give leave_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. enter guest mode) before loading state from the SMM * state-save area. */ - if (ctxt->ops->pre_leave_smm(ctxt, buf)) - return X86EMUL_UNHANDLEABLE; + if (ctxt->ops->leave_smm(ctxt, buf)) + goto emulate_shutdown; #ifdef CONFIG_X86_64 if (emulator_has_longmode(ctxt)) @@ -2692,13 +2588,21 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) #endif ret = rsm_load_state_32(ctxt, buf); - if (ret != X86EMUL_CONTINUE) { - /* FIXME: should triple fault */ - return X86EMUL_UNHANDLEABLE; - } + if (ret != X86EMUL_CONTINUE) + goto emulate_shutdown; - ctxt->ops->post_leave_smm(ctxt); + /* + * Note, the ctxt->ops callbacks are responsible for handling side + * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID + * runtime updates, etc... If that changes, e.g. this flow is moved + * out of the emulator to make it look more like enter_smm(), then + * those side effects need to be explicitly handled for both success + * and shutdown. + */ + return X86EMUL_CONTINUE; +emulate_shutdown: + ctxt->ops->triple_fault(ctxt); return X86EMUL_CONTINUE; } @@ -4124,11 +4028,11 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - emulator_get_fpu(); + kvm_fpu_get(); rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); - emulator_put_fpu(); + kvm_fpu_put(); if (rc != X86EMUL_CONTINUE) return rc; @@ -4172,7 +4076,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - emulator_get_fpu(); + kvm_fpu_get(); if (size < __fxstate_size(16)) { rc = fxregs_fixup(&fx_state, size); @@ -4189,7 +4093,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: - emulator_put_fpu(); + kvm_fpu_put(); return rc; } @@ -5437,9 +5341,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; - emulator_get_fpu(); + kvm_fpu_get(); rc = asm_safe("fwait"); - emulator_put_fpu(); + kvm_fpu_put(); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); @@ -5450,7 +5354,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) static void fetch_possible_mmx_operand(struct operand *op) { if (op->type == OP_MM) - read_mmx_reg(&op->mm_val, op->addr.mm); + kvm_read_mmx_reg(op->addr.mm, &op->mm_val); } static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h new file mode 100644 index 000000000000..3ba12888bf66 --- /dev/null +++ b/arch/x86/kvm/fpu.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __KVM_FPU_H_ +#define __KVM_FPU_H_ + +#include <asm/fpu/api.h> + +typedef u32 __attribute__((vector_size(16))) sse128_t; +#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } +#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) +#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; }) +#define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; }) +#define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; }) +#define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; }) +#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; }) +#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; }) + +static inline void _kvm_read_sse_reg(int reg, sse128_t *data) +{ + switch (reg) { + case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } +} + +static inline void _kvm_write_sse_reg(int reg, const sse128_t *data) +{ + switch (reg) { + case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; +#endif + default: BUG(); + } +} + +static inline void _kvm_read_mmx_reg(int reg, u64 *data) +{ + switch (reg) { + case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; + case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; + case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; + case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; + case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; + case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; + case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; + case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + default: BUG(); + } +} + +static inline void _kvm_write_mmx_reg(int reg, const u64 *data) +{ + switch (reg) { + case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; + case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; + case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; + case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; + case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; + case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; + case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; + case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + default: BUG(); + } +} + +static inline void kvm_fpu_get(void) +{ + fpregs_lock(); + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); +} + +static inline void kvm_fpu_put(void) +{ + fpregs_unlock(); +} + +static inline void kvm_read_sse_reg(int reg, sse128_t *data) +{ + kvm_fpu_get(); + _kvm_read_sse_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_write_sse_reg(int reg, const sse128_t *data) +{ + kvm_fpu_get(); + _kvm_write_sse_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_read_mmx_reg(int reg, u64 *data) +{ + kvm_fpu_get(); + _kvm_read_mmx_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_write_mmx_reg(int reg, const u64 *data) +{ + kvm_fpu_get(); + _kvm_write_mmx_reg(reg, data); + kvm_fpu_put(); +} + +#endif diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f00830e5202f..b07592ca92f0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -36,6 +36,7 @@ #include "trace.h" #include "irq.h" +#include "fpu.h" /* "Hv#1" signature */ #define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 @@ -273,15 +274,10 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *entry; - - entry = kvm_find_cpuid_entry(vcpu, - HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, - 0); - if (!entry) - return false; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + return hv_vcpu->cpuid_cache.syndbg_cap_eax & + HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; } static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) @@ -635,11 +631,17 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && !host) return 1; + if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && + !(hv_vcpu->cpuid_cache.features_edx & + HV_STIMER_DIRECT_MODE_AVAILABLE))) + return 1; + trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); @@ -1206,12 +1208,90 @@ out_unlock: mutex_unlock(&hv->hv_lock); } + +static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) +{ + if (!hv_vcpu->enforce_cpuid) + return true; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_HYPERCALL_AVAILABLE; + case HV_X64_MSR_VP_RUNTIME: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_VP_RUNTIME_AVAILABLE; + case HV_X64_MSR_TIME_REF_COUNT: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_TIME_REF_COUNT_AVAILABLE; + case HV_X64_MSR_VP_INDEX: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_VP_INDEX_AVAILABLE; + case HV_X64_MSR_RESET: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_RESET_AVAILABLE; + case HV_X64_MSR_REFERENCE_TSC: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_REFERENCE_TSC_AVAILABLE; + case HV_X64_MSR_SCONTROL: + case HV_X64_MSR_SVERSION: + case HV_X64_MSR_SIEFP: + case HV_X64_MSR_SIMP: + case HV_X64_MSR_EOM: + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_SYNIC_AVAILABLE; + case HV_X64_MSR_STIMER0_CONFIG: + case HV_X64_MSR_STIMER1_CONFIG: + case HV_X64_MSR_STIMER2_CONFIG: + case HV_X64_MSR_STIMER3_CONFIG: + case HV_X64_MSR_STIMER0_COUNT: + case HV_X64_MSR_STIMER1_COUNT: + case HV_X64_MSR_STIMER2_COUNT: + case HV_X64_MSR_STIMER3_COUNT: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_SYNTIMER_AVAILABLE; + case HV_X64_MSR_EOI: + case HV_X64_MSR_ICR: + case HV_X64_MSR_TPR: + case HV_X64_MSR_VP_ASSIST_PAGE: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_APIC_ACCESS_AVAILABLE; + break; + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_FREQUENCY_MSRS; + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_REENLIGHTENMENT; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return hv_vcpu->cpuid_cache.features_edx & + HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return hv_vcpu->cpuid_cache.features_edx & + HV_FEATURE_DEBUG_MSRS_AVAILABLE; + default: + break; + } + + return false; +} + static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); + if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) + return 1; + switch (msr) { case HV_X64_MSR_GUEST_OS_ID: hv->hv_guest_os_id = data; @@ -1340,6 +1420,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) + return 1; + switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); @@ -1454,6 +1537,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); + if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) + return 1; + switch (msr) { case HV_X64_MSR_GUEST_OS_ID: data = hv->hv_guest_os_id; @@ -1503,6 +1589,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, u64 data = 0; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) + return 1; + switch (msr) { case HV_X64_MSR_VP_INDEX: data = hv_vcpu->vp_index; @@ -1631,8 +1720,22 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( return vcpu_bitmap; } -static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex) +struct kvm_hv_hcall { + u64 param; + u64 ingpa; + u64 outgpa; + u16 code; + u16 rep_cnt; + u16 rep_idx; + bool fast; + bool rep; + sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; +}; + +static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) { + int i; + gpa_t gpa; struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; @@ -1646,8 +1749,15 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool bool all_cpus; if (!ex) { - if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush)))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->fast) { + flush.address_space = hc->ingpa; + flush.flags = hc->outgpa; + flush.processor_mask = sse128_lo(hc->xmm[0]); + } else { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, + &flush, sizeof(flush)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags); @@ -1665,9 +1775,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || flush.processor_mask == 0; } else { - if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, - sizeof(flush_ex)))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->fast) { + flush_ex.address_space = hc->ingpa; + flush_ex.flags = hc->outgpa; + memcpy(&flush_ex.hv_vp_set, + &hc->xmm[0], sizeof(hc->xmm[0])); + } else { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, + sizeof(flush_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, @@ -1678,20 +1795,28 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = - bitmap_weight((unsigned long *)&valid_bank_mask, 64) * - sizeof(sparse_banks[0]); + sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64); if (!sparse_banks_len && !all_cpus) goto ret_success; - if (!all_cpus && - kvm_read_guest(kvm, - ingpa + offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents), - sparse_banks, - sparse_banks_len)) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (!all_cpus) { + if (hc->fast) { + if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + for (i = 0; i < sparse_banks_len; i += 2) { + sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]); + sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]); + } + } else { + gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex, + hv_vp_set.bank_contents); + if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks, + sparse_banks_len * + sizeof(sparse_banks[0])))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + } } cpumask_clear(&hv_vcpu->tlb_flush); @@ -1704,13 +1829,13 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, NULL, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: - /* We always do full TLB flush, set rep_done = rep_cnt. */ + /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ return (u64)HV_STATUS_SUCCESS | - ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); + ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, @@ -1732,8 +1857,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, } } -static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, - bool ex, bool fast) +static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) { struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; @@ -1748,25 +1872,25 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, bool all_cpus; if (!ex) { - if (!fast) { - if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi, + if (!hc->fast) { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; vector = send_ipi.vector; } else { /* 'reserved' part of hv_send_ipi should be 0 */ - if (unlikely(ingpa >> 32 != 0)) + if (unlikely(hc->ingpa >> 32 != 0)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - sparse_banks[0] = outgpa; - vector = (u32)ingpa; + sparse_banks[0] = hc->outgpa; + vector = (u32)hc->ingpa; } all_cpus = false; valid_bank_mask = BIT_ULL(0); trace_kvm_hv_send_ipi(vector, sparse_banks[0]); } else { - if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex, + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, sizeof(send_ipi_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -1786,8 +1910,8 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, if (!all_cpus && kvm_read_guest(kvm, - ingpa + offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents), + hc->ingpa + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents), sparse_banks, sparse_banks_len)) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -1809,12 +1933,67 @@ ret_success: void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); - if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) + if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; - else + } else { vcpu->arch.hyperv_enabled = false; + return; + } + + if (!to_hv_vcpu(vcpu) && kvm_hv_vcpu_init(vcpu)) + return; + + hv_vcpu = to_hv_vcpu(vcpu); + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + if (entry) { + hv_vcpu->cpuid_cache.features_eax = entry->eax; + hv_vcpu->cpuid_cache.features_ebx = entry->ebx; + hv_vcpu->cpuid_cache.features_edx = entry->edx; + } else { + hv_vcpu->cpuid_cache.features_eax = 0; + hv_vcpu->cpuid_cache.features_ebx = 0; + hv_vcpu->cpuid_cache.features_edx = 0; + } + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + if (entry) { + hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; + hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; + } else { + hv_vcpu->cpuid_cache.enlightenments_eax = 0; + hv_vcpu->cpuid_cache.enlightenments_ebx = 0; + } + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + if (entry) + hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; + else + hv_vcpu->cpuid_cache.syndbg_cap_eax = 0; +} + +int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) +{ + struct kvm_vcpu_hv *hv_vcpu; + int ret = 0; + + if (!to_hv_vcpu(vcpu)) { + if (enforce) { + ret = kvm_hv_vcpu_init(vcpu); + if (ret) + return ret; + } else { + return 0; + } + } + + hv_vcpu = to_hv_vcpu(vcpu); + hv_vcpu->enforce_cpuid = enforce; + + return ret; } bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) @@ -1847,20 +2026,21 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } -static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) +static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; - if (unlikely(!fast)) { + if (unlikely(!hc->fast)) { int ret; - gpa_t gpa = param; + gpa_t gpa = hc->ingpa; - if ((gpa & (__alignof__(param) - 1)) || - offset_in_page(gpa) + sizeof(param) > PAGE_SIZE) + if ((gpa & (__alignof__(hc->ingpa) - 1)) || + offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE) return HV_STATUS_INVALID_ALIGNMENT; - ret = kvm_vcpu_read_guest(vcpu, gpa, ¶m, sizeof(param)); + ret = kvm_vcpu_read_guest(vcpu, gpa, + &hc->ingpa, sizeof(hc->ingpa)); if (ret < 0) return HV_STATUS_INVALID_ALIGNMENT; } @@ -1870,15 +2050,15 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) * have no use for it, and in all known usecases it is zero, so just * report lookup failure if it isn't. */ - if (param & 0xffff00000000ULL) + if (hc->ingpa & 0xffff00000000ULL) return HV_STATUS_INVALID_PORT_ID; /* remaining bits are reserved-zero */ - if (param & ~KVM_HYPERV_CONN_ID_MASK) + if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); - eventfd = idr_find(&hv->conn_to_evt, param); + eventfd = idr_find(&hv->conn_to_evt, hc->ingpa); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; @@ -1887,11 +2067,80 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) return HV_STATUS_SUCCESS; } +static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) +{ + switch (hc->code) { + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + return true; + } + + return false; +} + +static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) +{ + int reg; + + kvm_fpu_get(); + for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) + _kvm_read_sse_reg(reg, &hc->xmm[reg]); + kvm_fpu_put(); +} + +static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) +{ + if (!hv_vcpu->enforce_cpuid) + return true; + + switch (code) { + case HVCALL_NOTIFY_LONG_SPIN_WAIT: + return hv_vcpu->cpuid_cache.enlightenments_ebx && + hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; + case HVCALL_POST_MESSAGE: + return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; + case HVCALL_SIGNAL_EVENT: + return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS; + case HVCALL_POST_DEBUG_DATA: + case HVCALL_RETRIEVE_DEBUG_DATA: + case HVCALL_RESET_DEBUG_SESSION: + /* + * Return 'true' when SynDBG is disabled so the resulting code + * will be HV_STATUS_INVALID_HYPERCALL_CODE. + */ + return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) || + hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + if (!(hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + fallthrough; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + return hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + case HVCALL_SEND_IPI_EX: + if (!(hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + fallthrough; + case HVCALL_SEND_IPI: + return hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_CLUSTER_IPI_RECOMMENDED; + default: + break; + } + + return true; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { - u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; - uint16_t code, rep_idx, rep_cnt; - bool fast, rep; + struct kvm_hv_hcall hc; + u64 ret = HV_STATUS_SUCCESS; /* * hypercall generates UD from non zero cpl and real mode @@ -1904,104 +2153,113 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (is_64_bit_mode(vcpu)) { - param = kvm_rcx_read(vcpu); - ingpa = kvm_rdx_read(vcpu); - outgpa = kvm_r8_read(vcpu); + hc.param = kvm_rcx_read(vcpu); + hc.ingpa = kvm_rdx_read(vcpu); + hc.outgpa = kvm_r8_read(vcpu); } else #endif { - param = ((u64)kvm_rdx_read(vcpu) << 32) | - (kvm_rax_read(vcpu) & 0xffffffff); - ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | - (kvm_rcx_read(vcpu) & 0xffffffff); - outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | - (kvm_rsi_read(vcpu) & 0xffffffff); + hc.param = ((u64)kvm_rdx_read(vcpu) << 32) | + (kvm_rax_read(vcpu) & 0xffffffff); + hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | + (kvm_rcx_read(vcpu) & 0xffffffff); + hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | + (kvm_rsi_read(vcpu) & 0xffffffff); } - code = param & 0xffff; - fast = !!(param & HV_HYPERCALL_FAST_BIT); - rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; - rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; - rep = !!(rep_cnt || rep_idx); + hc.code = hc.param & 0xffff; + hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); + hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; + hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; + hc.rep = !!(hc.rep_cnt || hc.rep_idx); - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + if (hc.fast && is_xmm_fast_hypercall(&hc)) + kvm_hv_hypercall_read_xmm(&hc); - switch (code) { + trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, + hc.ingpa, hc.outgpa); + + if (unlikely(!hv_check_hypercall_access(to_hv_vcpu(vcpu), hc.code))) { + ret = HV_STATUS_ACCESS_DENIED; + goto hypercall_complete; + } + + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - if (unlikely(rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: - if (unlikely(rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hvcall_signal_event(vcpu, fast, ingpa); + ret = kvm_hvcall_signal_event(vcpu, &hc); if (ret != HV_STATUS_INVALID_PORT_ID) break; fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(rep || !to_hv_synic(vcpu)->active)) { + if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = param; - vcpu->run->hyperv.u.hcall.params[0] = ingpa; - vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: - if (unlikely(fast || !rep_cnt || rep_idx)) { + if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); + ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: - if (unlikely(fast || rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); + ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: - if (unlikely(fast || !rep_cnt || rep_idx)) { + if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); + ret = kvm_hv_flush_tlb(vcpu, &hc, true); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: - if (unlikely(fast || rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); + ret = kvm_hv_flush_tlb(vcpu, &hc, true); break; case HVCALL_SEND_IPI: - if (unlikely(rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast); + ret = kvm_hv_send_ipi(vcpu, &hc, false); break; case HVCALL_SEND_IPI_EX: - if (unlikely(fast || rep)) { + if (unlikely(hc.fast || hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); + ret = kvm_hv_send_ipi(vcpu, &hc, true); break; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: - if (unlikely(fast)) { + if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } @@ -2020,9 +2278,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = param; - vcpu->run->hyperv.u.hcall.params[0] = ingpa; - vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; @@ -2032,6 +2290,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } +hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); } @@ -2180,6 +2439,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; + ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 60547d5cb6d7..730da8537d05 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -138,6 +138,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm); void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu); +int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce); int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3db5c42c9ecd..90e1ffdc05b7 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -55,6 +55,13 @@ static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } +static inline void kvm_register_clear_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -118,6 +125,11 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.walk_mmu->pdptrs[index]; } +static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value) +{ + vcpu->arch.walk_mmu->pdptrs[index] = value; +} + static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; @@ -162,6 +174,7 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; + vcpu->stat.guest_mode = 1; } static inline void leave_guest_mode(struct kvm_vcpu *vcpu) @@ -172,6 +185,8 @@ static inline void leave_guest_mode(struct kvm_vcpu *vcpu) vcpu->arch.load_eoi_exitmap_pending = false; kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu); } + + vcpu->stat.guest_mode = 0; } static inline bool is_guest_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 3e870bf9ca4d..68b420289d7e 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -13,6 +13,7 @@ #define _ASM_X86_KVM_X86_EMULATE_H #include <asm/desc_defs.h> +#include "fpu.h" struct x86_emulate_ctxt; enum x86_intercept; @@ -229,15 +230,12 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); - void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); - int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, - const char *smstate); - void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); + void (*exiting_smm)(struct x86_emulate_ctxt *ctxt); + int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); + void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; -typedef u32 __attribute__((vector_size(16))) sse128_t; - /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c new file mode 100644 index 000000000000..c7db2df50a7a --- /dev/null +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM L1 hypervisor optimizations on Hyper-V. + */ + +#include <linux/kvm_host.h> +#include <asm/mshyperv.h> + +#include "hyperv.h" +#include "kvm_onhyperv.h" + +static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, + void *data) +{ + struct kvm_tlb_range *range = data; + + return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, + range->pages); +} + +static inline int hv_remote_flush_root_tdp(hpa_t root_tdp, + struct kvm_tlb_range *range) +{ + if (range) + return hyperv_flush_guest_mapping_range(root_tdp, + kvm_fill_hv_flush_list_func, (void *)range); + else + return hyperv_flush_guest_mapping(root_tdp); +} + +int hv_remote_flush_tlb_with_range(struct kvm *kvm, + struct kvm_tlb_range *range) +{ + struct kvm_arch *kvm_arch = &kvm->arch; + struct kvm_vcpu *vcpu; + int ret = 0, i, nr_unique_valid_roots; + hpa_t root; + + spin_lock(&kvm_arch->hv_root_tdp_lock); + + if (!VALID_PAGE(kvm_arch->hv_root_tdp)) { + nr_unique_valid_roots = 0; + + /* + * Flush all valid roots, and see if all vCPUs have converged + * on a common root, in which case future flushes can skip the + * loop and flush the common root. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + root = vcpu->arch.hv_root_tdp; + if (!VALID_PAGE(root) || root == kvm_arch->hv_root_tdp) + continue; + + /* + * Set the tracked root to the first valid root. Keep + * this root for the entirety of the loop even if more + * roots are encountered as a low effort optimization + * to avoid flushing the same (first) root again. + */ + if (++nr_unique_valid_roots == 1) + kvm_arch->hv_root_tdp = root; + + if (!ret) + ret = hv_remote_flush_root_tdp(root, range); + + /* + * Stop processing roots if a failure occurred and + * multiple valid roots have already been detected. + */ + if (ret && nr_unique_valid_roots > 1) + break; + } + + /* + * The optimized flush of a single root can't be used if there + * are multiple valid roots (obviously). + */ + if (nr_unique_valid_roots > 1) + kvm_arch->hv_root_tdp = INVALID_PAGE; + } else { + ret = hv_remote_flush_root_tdp(kvm_arch->hv_root_tdp, range); + } + + spin_unlock(&kvm_arch->hv_root_tdp_lock); + return ret; +} +EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range); + +int hv_remote_flush_tlb(struct kvm *kvm) +{ + return hv_remote_flush_tlb_with_range(kvm, NULL); +} +EXPORT_SYMBOL_GPL(hv_remote_flush_tlb); diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h new file mode 100644 index 000000000000..1c67abf2eba9 --- /dev/null +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM L1 hypervisor optimizations on Hyper-V. + */ + +#ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__ +#define __ARCH_X86_KVM_KVM_ONHYPERV_H__ + +#if IS_ENABLED(CONFIG_HYPERV) +int hv_remote_flush_tlb_with_range(struct kvm *kvm, + struct kvm_tlb_range *range); +int hv_remote_flush_tlb(struct kvm *kvm); + +static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) +{ + struct kvm_arch *kvm_arch = &vcpu->kvm->arch; + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_arch->hv_root_tdp_lock); + vcpu->arch.hv_root_tdp = root_tdp; + if (root_tdp != kvm_arch->hv_root_tdp) + kvm_arch->hv_root_tdp = INVALID_PAGE; + spin_unlock(&kvm_arch->hv_root_tdp_lock); + } +} +#else /* !CONFIG_HYPERV */ +static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) +{ +} +#endif /* !CONFIG_HYPERV */ + +#endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 17fa4ab1b834..ba5a27879f1d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2631,6 +2631,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); __start_apic_timer(apic, APIC_TMCCT); + kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { @@ -2872,7 +2873,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); } -void kvm_apic_accept_events(struct kvm_vcpu *vcpu) +int kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; @@ -2880,7 +2881,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) unsigned long pe; if (!lapic_in_kernel(vcpu)) - return; + return 0; /* * Read pending events before calling the check_events @@ -2888,12 +2889,12 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) */ pe = smp_load_acquire(&apic->pending_events); if (!pe) - return; + return 0; if (is_guest_mode(vcpu)) { r = kvm_check_nested_events(vcpu); if (r < 0) - return; + return r == -EBUSY ? 0 : r; /* * If an event has happened and caused a vmexit, * we know INITs are latched and therefore @@ -2914,7 +2915,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); if (test_bit(KVM_APIC_SIPI, &pe)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); - return; + return 0; } if (test_bit(KVM_APIC_INIT, &pe)) { @@ -2935,6 +2936,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } + return 0; } void kvm_lapic_exit(void) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 997c45a5963a..d7c25d0c1354 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -76,7 +76,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); -void kvm_apic_accept_events(struct kvm_vcpu *vcpu); +int kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 88d0ed5225a4..83e6c6965f1e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,6 +44,12 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \ + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \ + X86_CR4_LA57) + +#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP) + static __always_inline u64 rsvd_bits(int s, int e) { BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s); @@ -62,12 +68,9 @@ static __always_inline u64 rsvd_bits(int s, int e) void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); -void -reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); - -void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); -void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, - gpa_t nested_cr3); +void kvm_init_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, + unsigned long cr4, u64 efer, gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); @@ -162,11 +165,6 @@ static inline bool is_writable_pte(unsigned long pte) return pte & PT_WRITABLE_MASK; } -static inline bool is_write_protection(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_WP); -} - /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE @@ -232,4 +230,14 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); +static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +{ + /* + * Read memslot_have_rmaps before rmap pointers. Hence, threads reading + * memslots_have_rmaps in any lock context are guaranteed to see the + * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps. + */ + return smp_load_acquire(&kvm->arch.memslots_have_rmaps); +} + #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8d5876dfc6b7..845d114ae075 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -55,7 +55,7 @@ extern bool itlb_multihit_kvm_mitigation; -static int __read_mostly nx_huge_pages = -1; +int __read_mostly nx_huge_pages = -1; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -176,9 +176,80 @@ static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); +struct kvm_mmu_role_regs { + const unsigned long cr0; + const unsigned long cr4; + const u64 efer; +}; + #define CREATE_TRACE_POINTS #include "mmutrace.h" +/* + * Yes, lot's of underscores. They're a hint that you probably shouldn't be + * reading from the role_regs. Once the mmu_role is constructed, it becomes + * the single source of truth for the MMU's state. + */ +#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ +static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +{ \ + return !!(regs->reg & flag); \ +} +BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); +BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); +BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); + +/* + * The MMU itself (with a valid role) is the single source of truth for the + * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The + * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1, + * and the vCPU may be incorrect/irrelevant. + */ +#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ +static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +{ \ + return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ +} +BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg); +BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); +BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); + +static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_role_regs regs = { + .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), + .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), + .efer = vcpu->arch.efer, + }; + + return regs; +} + +static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs) +{ + if (!____is_cr0_pg(regs)) + return 0; + else if (____is_efer_lma(regs)) + return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : + PT64_ROOT_4LEVEL; + else if (____is_cr4_pae(regs)) + return PT32E_ROOT_LEVEL; + else + return PT32_ROOT_LEVEL; +} static inline bool kvm_available_flush_tlb_with_range(void) { @@ -208,11 +279,6 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { @@ -269,11 +335,6 @@ static int is_cpuid_PSE36(void) return 1; } -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.efer & EFER_NX; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -1177,8 +1238,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, * @gfn_offset: start of the BITS_PER_LONG pages we care about * @mask: indicates which pages we should protect * - * Used when we do not need to care about huge page mappings: e.g. during dirty - * logging we do not have any such mappings. + * Used when we do not need to care about huge page mappings. */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -1189,6 +1249,10 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1218,6 +1282,10 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1235,13 +1303,36 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to * enable dirty logging for them. * - * Used when we do not need to care about huge page mappings: e.g. during dirty - * logging we do not have any such mappings. + * We need to care about huge page mappings: e.g. during dirty logging we may + * have such mappings. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { + /* + * Huge pages are NOT write protected when we start dirty logging in + * initially-all-set mode; must write protect them here so that they + * are split to 4K on the first write. + * + * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn + * of memslot has no such restriction, so the range can cross two large + * pages. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { + gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); + gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); + + kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); + + /* Cross two large pages? */ + if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != + ALIGN(end << PAGE_SHIFT, PMD_SIZE)) + kvm_mmu_slot_gfn_write_protect(kvm, slot, end, + PG_LEVEL_2M); + } + + /* Now handle 4K PTEs. */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else @@ -1254,20 +1345,23 @@ int kvm_cpu_dirty_log_size(void) } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn) + struct kvm_memory_slot *slot, u64 gfn, + int min_level) { struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; - for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + if (kvm_memslots_have_rmaps(kvm)) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); + } } if (is_tdp_mmu_enabled(kvm)) write_protected |= - kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); return write_protected; } @@ -1277,7 +1371,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); + return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1433,9 +1527,10 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1445,9 +1540,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1500,9 +1596,10 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1512,9 +1609,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -1748,17 +1846,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else -static inline bool is_ept_sp(struct kvm_mmu_page *sp) -{ - return sp->role.cr0_wp && sp->role.smap_andnot_wp; -} - -/* @sp->gfn should be write-protected at the call site */ -static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { - if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) || - vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { + if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -1804,31 +1895,6 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } -static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) -{ - kvm_unlink_unsync_page(vcpu->kvm, sp); - return __kvm_sync_page(vcpu, sp, invalid_list); -} - -/* @gfn should be write-protected at the call site */ -static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *s; - bool ret = false; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { - if (!s->unsync) - continue; - - WARN_ON(s->role.level != PG_LEVEL_4K); - ret |= kvm_sync_page(vcpu, s, invalid_list); - } - - return ret; -} - struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; unsigned int idx[PT64_ROOT_MAX_LEVEL]; @@ -1923,6 +1989,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } for_each_sp(pages, sp, parents, i) { + kvm_unlink_unsync_page(vcpu->kvm, sp); flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } @@ -1958,8 +2025,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; - bool need_sync = false; - bool flush = false; int collisions = 0; LIST_HEAD(invalid_list); @@ -1982,20 +2047,39 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, continue; } - if (!need_sync && sp->unsync) - need_sync = true; - - if (sp->role.word != role.word) + if (sp->role.word != role.word) { + /* + * If the guest is creating an upper-level page, zap + * unsync pages for the same gfn. While it's possible + * the guest is using recursive page tables, in all + * likelihood the guest has stopped using the unsync + * page and is installing a completely unrelated page. + * Unsync pages must not be left as is, because the new + * upper-level page will be write-protected. + */ + if (level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); continue; + } if (direct_mmu) goto trace_get_page; if (sp->unsync) { - /* The page is good, but __kvm_sync_page might still end - * up zapping it. If so, break in order to rebuild it. + /* + * The page is good, but is stale. kvm_sync_page does + * get the latest guest state, but (unlike mmu_unsync_children) + * it doesn't write-protect the page or mark it synchronized! + * This way the validity of the mapping is ensured, but the + * overhead of write protection is not incurred until the + * guest invalidates the TLB mapping. This allows multiple + * SPs for a single gfn to be unsync. + * + * If the sync fails, the page is zapped. If so, break + * in order to rebuild it. */ - if (!__kvm_sync_page(vcpu, sp, &invalid_list)) + if (!kvm_sync_page(vcpu, sp, &invalid_list)) break; WARN_ON(!list_empty(&invalid_list)); @@ -2020,22 +2104,14 @@ trace_get_page: sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (!direct) { - /* - * we should do write protection before syncing pages - * otherwise the content of the synced shadow page may - * be inconsistent with guest page table. - */ account_shadowed(vcpu->kvm, sp); if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); - - if (level > PG_LEVEL_4K && need_sync) - flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } trace_kvm_mmu_get_page(sp, true); - - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); out: + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; @@ -2374,7 +2450,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) * page is available, while the caller may end up allocating as many as * four pages, e.g. for PAE roots or for 5-level paging. Temporarily * exceeding the (arbitrary by default) limit will not harm the host, - * being too agressive may unnecessarily kill the guest, and getting an + * being too aggressive may unnecessarily kill the guest, and getting an * exact count is far more trouble than it's worth, especially in the * page fault paths. */ @@ -2448,17 +2524,33 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +/* + * Attempt to unsync any shadow pages that can be reached by the specified gfn, + * KVM is creating a writable mapping for said gfn. Returns 0 if all pages + * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must + * be write-protected. + */ +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + /* + * Force write-protection if the page is being tracked. Note, the page + * track machinery is used to write-protect upper-level shadow pages, + * i.e. this guards the role.level == 4K assertion below! + */ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) - return true; + return -EPERM; + /* + * The page is not write-tracked, mark existing shadow pages unsync + * unless KVM is synchronizing an unsync SP (can_unsync = false). In + * that case, KVM must complete emulation of the guest TLB flush before + * allowing shadow pages to become unsync (writable by the guest). + */ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) - return true; + return -EPERM; if (sp->unsync) continue; @@ -2489,8 +2581,8 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, * 2.2 Guest issues TLB flush. * That causes a VM Exit. * - * 2.3 kvm_mmu_sync_pages() reads sp->unsync. - * Since it is false, so it just returns. + * 2.3 Walking of unsync pages sees sp->unsync is + * false and skips the page. * * 2.4 Guest accesses GVA X. * Since the mapping in the SP was not updated, @@ -2506,7 +2598,7 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, */ smp_wmb(); - return false; + return 0; } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2827,9 +2919,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); @@ -3180,6 +3269,33 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); +void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + unsigned long roots_to_free = 0; + hpa_t root_hpa; + int i; + + /* + * This should not be called while L2 is active, L2 can't invalidate + * _only_ its own roots, e.g. INVVPID unconditionally exits. + */ + WARN_ON_ONCE(mmu->mmu_role.base.guest_mode); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + root_hpa = mmu->prev_roots[i].hpa; + if (!VALID_PAGE(root_hpa)) + continue; + + if (!to_shadow_page(root_hpa) || + to_shadow_page(root_hpa)->role.guest_mode) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + } + + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); +} +EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); + + static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; @@ -3280,6 +3396,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } + r = alloc_all_memslots_rmaps(vcpu->kvm); + if (r) + return r; + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) @@ -3423,8 +3543,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) * flush strictly after those changes are made. We only need to * ensure that the other CPU sets these flags before any actual * changes to the page tables are made. The comments in - * mmu_need_write_protect() describe what could go wrong if this - * requirement isn't satisfied. + * mmu_try_to_unsync_pages() describe what could go wrong if + * this requirement isn't satisfied. */ if (!smp_load_acquire(&sp->unsync) && !smp_load_acquire(&sp->unsync_children)) @@ -3474,19 +3594,6 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } -static bool -__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) -{ - int bit7 = (pte >> 7) & 1; - - return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; -} - -static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) -{ - return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); -} - static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { /* @@ -3540,12 +3647,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { - *sptep = 0ull; - return reserved; - } - - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -3569,13 +3671,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the - * reserved bit and EPT's invalid memtype/XWR checks to avoid - * adding a Jcc in the loop. - */ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) | - __is_rsvd_bits_set(rsvd_check, sptes[level], level); + reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", @@ -3583,7 +3679,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) for (level = root; level >= leaf; level--) pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", sptes[level], level, - rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]); + get_rsvd_bits(rsvd_check, sptes[level], level)); } return reserved; @@ -3717,6 +3813,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { + bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); bool write = error_code & PFERR_WRITE_MASK; bool map_writable; @@ -3729,7 +3826,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + if (!is_tdp_mmu_fault) { r = fast_page_fault(vcpu, gpa, error_code); if (r != RET_PF_INVALID) return r; @@ -3751,7 +3848,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = RET_PF_RETRY; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_lock(&vcpu->kvm->mmu_lock); else write_lock(&vcpu->kvm->mmu_lock); @@ -3762,7 +3859,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (r) goto out_unlock; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, pfn, prefault); else @@ -3770,7 +3867,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); @@ -3840,17 +3937,13 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, max_level, true); } -static void nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = true; - context->nx = false; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, @@ -3913,8 +4006,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, } static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, - union kvm_mmu_page_role new_role, - bool skip_tlb_flush, bool skip_mmu_sync) + union kvm_mmu_page_role new_role) { if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); @@ -3929,10 +4021,10 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - if (!skip_mmu_sync || force_flush_and_sync_on_reuse) + if (force_flush_and_sync_on_reuse) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - if (!skip_tlb_flush || force_flush_and_sync_on_reuse) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } /* * The last MMIO access's GVA and GPA are cached in the VCPU. When @@ -3951,11 +4043,9 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, to_shadow_page(vcpu->arch.mmu->root_hpa)); } -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, - bool skip_mmu_sync) +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { - __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu), - skip_tlb_flush, skip_mmu_sync); + __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu)); } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); @@ -3981,26 +4071,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } -static inline bool is_last_gpte(struct kvm_mmu *mmu, - unsigned level, unsigned gpte) -{ - /* - * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. - * If it is clear, there are no large pages at this level, so clear - * PT_PAGE_SIZE_MASK in gpte if that is the case. - */ - gpte &= level - mmu->last_nonleaf_level; - - /* - * PG_LEVEL_4K always terminates. The RHS has bit 7 set - * iff level <= PG_LEVEL_4K, which for our purpose means - * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PG_LEVEL_4K - 1; - - return gpte & PT_PAGE_SIZE_MASK; -} - #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" @@ -4015,8 +4085,7 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, #undef PTTYPE static void -__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct rsvd_bits_validate *rsvd_check, +__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { @@ -4105,14 +4174,29 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } +static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) +{ + /* + * If TDP is enabled, let the guest use GBPAGES if they're supported in + * hardware. The hardware page walker doesn't let KVM disable GBPAGES, + * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA + * walk for performance and complexity reasons. Not to mention KVM + * _can't_ solve the problem because GVA->GPA walks aren't visible to + * KVM once a TDP translation is installed. Mimic hardware behavior so + * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. + */ + return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); +} + static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, - context->root_level, context->nx, - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), + context->root_level, is_efer_nx(context), + guest_can_use_gbpages(vcpu), + is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } @@ -4165,24 +4249,32 @@ static inline u64 reserved_hpa_bits(void) * table in guest or amd nested guest, its mmu features completely * follow the features in guest. */ -void -reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - bool uses_nx = context->nx || - context->mmu_role.base.smep_andnot_wp; + /* + * KVM uses NX when TDP is disabled to handle a variety of scenarios, + * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and + * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. + * The iTLB multi-hit workaround can be toggled at any time, so assume + * NX can be used by any non-nested shadow MMU to avoid having to reset + * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. + */ + bool uses_nx = is_efer_nx(context) || !tdp_enabled; + + /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ + bool is_amd = true; + /* KVM doesn't use 2-level page tables for the shadow MMU. */ + bool is_pse = false; struct rsvd_bits_validate *shadow_zero_check; int i; - /* - * Passing "true" to the last argument is okay; it adds a check - * on bit 8 of the SPTEs which KVM doesn't use anyway. - */ + WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); + shadow_zero_check = &context->shadow_zero_check; - __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - reserved_hpa_bits(), + __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, uses_nx, - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), true); + guest_can_use_gbpages(vcpu), is_pse, is_amd); if (!shadow_me_mask) return; @@ -4193,7 +4285,6 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } } -EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); static inline bool boot_cpu_is_amd(void) { @@ -4215,11 +4306,10 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, shadow_zero_check = &context->shadow_zero_check; if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - reserved_hpa_bits(), + __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), - true, true); + false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, reserved_hpa_bits(), false); @@ -4255,8 +4345,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, (7 & (access) ? 128 : 0)) -static void update_permission_bitmask(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) { unsigned byte; @@ -4264,9 +4353,10 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, const u8 w = BYTE_MASK(ACC_WRITE_MASK); const u8 u = BYTE_MASK(ACC_USER_MASK); - bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0; - bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0; - bool cr0_wp = is_write_protection(vcpu); + bool cr4_smep = is_cr4_smep(mmu); + bool cr4_smap = is_cr4_smap(mmu); + bool cr0_wp = is_cr0_wp(mmu); + bool efer_nx = is_efer_nx(mmu); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { unsigned pfec = byte << 1; @@ -4292,7 +4382,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; /* Not really needed: !nx will cause pte.nx to fault */ - if (!mmu->nx) + if (!efer_nx) ff = 0; /* Allow supervisor writes if !cr0.wp */ @@ -4351,24 +4441,17 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * away both AD and WD. For all reads or if the last condition holds, WD * only will be masked away. */ -static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept) +static void update_pkru_bitmask(struct kvm_mmu *mmu) { unsigned bit; bool wp; - if (ept) { - mmu->pkru_mask = 0; - return; - } - - /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { + if (!is_cr4_pke(mmu)) { mmu->pkru_mask = 0; return; } - wp = is_write_protection(vcpu); + wp = is_cr0_wp(mmu); for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { unsigned pfec, pkey_bits; @@ -4402,81 +4485,51 @@ static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } } -static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) { - unsigned root_level = mmu->root_level; + if (!is_cr0_pg(mmu)) + return; - mmu->last_nonleaf_level = root_level; - if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) - mmu->last_nonleaf_level++; + reset_rsvds_bits_mask(vcpu, mmu); + update_permission_bitmask(mmu, false); + update_pkru_bitmask(mmu); } -static void paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static void paging64_init_context(struct kvm_mmu *context) { - context->nx = is_nx(vcpu); - context->root_level = level; - - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); - - MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; - context->shadow_root_level = level; context->direct_map = false; } -static void paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - int root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; - - paging64_init_context_common(vcpu, context, root_level); -} - -static void paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32_init_context(struct kvm_mmu *context) { - context->nx = false; - context->root_level = PT32_ROOT_LEVEL; - - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); - context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; - context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = false; } -static void paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); -} - -static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) +static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { union kvm_mmu_extended_role ext = {0}; - ext.cr0_pg = !!is_paging(vcpu); - ext.cr4_pae = !!is_pae(vcpu); - ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); - ext.cr4_pse = !!is_pse(vcpu); - ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); - ext.maxphyaddr = cpuid_maxphyaddr(vcpu); + if (____is_cr0_pg(regs)) { + ext.cr0_pg = 1; + ext.cr4_pae = ____is_cr4_pae(regs); + ext.cr4_smep = ____is_cr4_smep(regs); + ext.cr4_smap = ____is_cr4_smap(regs); + ext.cr4_pse = ____is_cr4_pse(regs); + + /* PKEY and LA57 are active iff long mode is active. */ + ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); + ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); + } ext.valid = 1; @@ -4484,20 +4537,23 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) } static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { union kvm_mmu_role role = {0}; role.base.access = ACC_ALL; - role.base.nxe = !!is_nx(vcpu); - role.base.cr0_wp = is_write_protection(vcpu); + if (____is_cr0_pg(regs)) { + role.base.efer_nx = ____is_efer_nx(regs); + role.base.cr0_wp = ____is_cr0_wp(regs); + } role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); if (base_only) return role; - role.ext = kvm_calc_mmu_role_ext(vcpu); + role.ext = kvm_calc_mmu_role_ext(vcpu, regs); return role; } @@ -4512,9 +4568,10 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); role.base.level = kvm_mmu_get_tdp_level(vcpu); @@ -4527,8 +4584,9 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.root_mmu; + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_mmu_role new_role = - kvm_calc_tdp_mmu_root_page_role(vcpu, false); + kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, false); if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -4542,60 +4600,44 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; + context->root_level = role_regs_to_root_level(®s); - if (!is_paging(vcpu)) { - context->nx = false; + if (!is_cr0_pg(context)) context->gva_to_gpa = nonpaging_gva_to_gpa; - context->root_level = 0; - } else if (is_long_mode(vcpu)) { - context->nx = is_nx(vcpu); - context->root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; - reset_rsvds_bits_mask(vcpu, context); - context->gva_to_gpa = paging64_gva_to_gpa; - } else if (is_pae(vcpu)) { - context->nx = is_nx(vcpu); - context->root_level = PT32E_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); + else if (is_cr4_pae(context)) context->gva_to_gpa = paging64_gva_to_gpa; - } else { - context->nx = false; - context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); + else context->gva_to_gpa = paging32_gva_to_gpa; - } - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); + reset_guest_paging_metadata(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } static union kvm_mmu_role -kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); - role.base.smep_andnot_wp = role.ext.cr4_smep && - !is_write_protection(vcpu); - role.base.smap_andnot_wp = role.ext.cr4_smap && - !is_write_protection(vcpu); - role.base.gpte_is_8_bytes = !!is_pae(vcpu); + role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); + role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); + role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); return role; } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, base_only); + kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only); - role.base.direct = !is_paging(vcpu); + role.base.direct = !____is_cr0_pg(regs); - if (!is_long_mode(vcpu)) + if (!____is_efer_lma(regs)) role.base.level = PT32E_ROOT_LEVEL; - else if (is_la57_mode(vcpu)) + else if (____is_cr4_la57(regs)) role.base.level = PT64_ROOT_5LEVEL; else role.base.level = PT64_ROOT_4LEVEL; @@ -4604,37 +4646,44 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - u32 cr0, u32 cr4, u32 efer, + struct kvm_mmu_role_regs *regs, union kvm_mmu_role new_role) { - if (!(cr0 & X86_CR0_PG)) - nonpaging_init_context(vcpu, context); - else if (efer & EFER_LMA) - paging64_init_context(vcpu, context); - else if (cr4 & X86_CR4_PAE) - paging32E_init_context(vcpu, context); - else - paging32_init_context(vcpu, context); + if (new_role.as_u64 == context->mmu_role.as_u64) + return; context->mmu_role.as_u64 = new_role.as_u64; + + if (!is_cr0_pg(context)) + nonpaging_init_context(context); + else if (is_cr4_pae(context)) + paging64_init_context(context); + else + paging32_init_context(context); + context->root_level = role_regs_to_root_level(regs); + + reset_guest_paging_metadata(vcpu, context); + context->shadow_root_level = new_role.base.level; + reset_shadow_zero_bits_mask(vcpu, context); } -static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); + kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); - if (new_role.as_u64 != context->mmu_role.as_u64) - shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + shadow_mmu_init_context(vcpu, context, regs, new_role); } static union kvm_mmu_role -kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, false); + kvm_calc_shadow_root_page_role_common(vcpu, regs, false); role.base.direct = false; role.base.level = kvm_mmu_get_tdp_level(vcpu); @@ -4642,23 +4691,22 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) return role; } -void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, - gpa_t nested_cr3) +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, + unsigned long cr4, u64 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; - union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + struct kvm_mmu_role_regs regs = { + .cr0 = cr0, + .cr4 = cr4, + .efer = efer, + }; + union kvm_mmu_role new_role; - __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s); - if (new_role.as_u64 != context->mmu_role.as_u64) { - shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); - /* - * Override the level set by the common init helper, nested TDP - * always uses the host's TDP configuration. - */ - context->shadow_root_level = new_role.base.level; - } + shadow_mmu_init_context(vcpu, context, ®s, new_role); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); @@ -4678,15 +4726,10 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.guest_mode = true; role.base.access = ACC_ALL; - /* - * WP=1 and NOT_WP=1 is an impossible combination, use WP and the - * SMAP variation to denote shadow EPT entries. - */ - role.base.cr0_wp = true; - role.base.smap_andnot_wp = true; - - role.ext = kvm_calc_mmu_role_ext(vcpu); + /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ + role.ext.word = 0; role.ext.execonly = execonly; + role.ext.valid = 1; return role; } @@ -4700,14 +4743,15 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true); + __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base); if (new_role.as_u64 == context->mmu_role.as_u64) return; + context->mmu_role.as_u64 = new_role.as_u64; + context->shadow_root_level = level; - context->nx = true; context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; @@ -4715,11 +4759,9 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->invlpg = ept_invlpg; context->root_level = level; context->direct_map = false; - context->mmu_role.as_u64 = new_role.as_u64; - update_permission_bitmask(vcpu, context, true); - update_pkru_bitmask(vcpu, context, true); - update_last_nonleaf_level(vcpu, context); + update_permission_bitmask(context, true); + update_pkru_bitmask(context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4728,20 +4770,21 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.root_mmu; + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); - kvm_init_shadow_mmu(vcpu, - kvm_read_cr0_bits(vcpu, X86_CR0_PG), - kvm_read_cr4_bits(vcpu, X86_CR4_PAE), - vcpu->arch.efer); + kvm_init_shadow_mmu(vcpu, ®s); context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } -static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) +static union kvm_mmu_role +kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) { - union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false); + union kvm_mmu_role role; + + role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false); /* * Nested MMUs are used only for walking L2's gva->gpa, they never have @@ -4749,23 +4792,14 @@ static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) * to "true" to try to detect bogus usage of the nested MMU. */ role.base.direct = true; - - if (!is_paging(vcpu)) - role.base.level = 0; - else if (is_long_mode(vcpu)) - role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : - PT64_ROOT_4LEVEL; - else if (is_pae(vcpu)) - role.base.level = PT32E_ROOT_LEVEL; - else - role.base.level = PT32_ROOT_LEVEL; - + role.base.level = role_regs_to_root_level(regs); return role; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { - union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu); + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); + union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, ®s); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_role.as_u64 == g_context->mmu_role.as_u64) @@ -4775,6 +4809,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->get_guest_pgd = get_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; + g_context->root_level = new_role.base.level; /* * L2 page tables are never shadowed, so there is no need to sync @@ -4790,44 +4825,20 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * nested page tables as the second level of translation. Basically * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ - if (!is_paging(vcpu)) { - g_context->nx = false; - g_context->root_level = 0; + if (!is_paging(vcpu)) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; - } else if (is_long_mode(vcpu)) { - g_context->nx = is_nx(vcpu); - g_context->root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); + else if (is_long_mode(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else if (is_pae(vcpu)) { - g_context->nx = is_nx(vcpu); - g_context->root_level = PT32E_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); + else if (is_pae(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else { - g_context->nx = false; - g_context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); + else g_context->gva_to_gpa = paging32_gva_to_gpa_nested; - } - update_permission_bitmask(vcpu, g_context, false); - update_pkru_bitmask(vcpu, g_context, false); - update_last_nonleaf_level(vcpu, g_context); + reset_guest_paging_metadata(vcpu, g_context); } -void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) +void kvm_init_mmu(struct kvm_vcpu *vcpu) { - if (reset_roots) { - uint i; - - vcpu->arch.mmu->root_hpa = INVALID_PAGE; - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - } - if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -4840,20 +4851,53 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) { + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_mmu_role role; if (tdp_enabled) - role = kvm_calc_tdp_mmu_root_page_role(vcpu, true); + role = kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, true); else - role = kvm_calc_shadow_mmu_root_page_role(vcpu, true); + role = kvm_calc_shadow_mmu_root_page_role(vcpu, ®s, true); return role.base; } +void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + /* + * Invalidate all MMU roles to force them to reinitialize as CPUID + * information is factored into reserved bit calculations. + */ + vcpu->arch.root_mmu.mmu_role.ext.valid = 0; + vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; + vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; + kvm_mmu_reset_context(vcpu); + + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise + * sweep the problem under the rug. + * + * KVM's horrific CPUID ABI makes the problem all but impossible to + * solve, as correctly handling multiple vCPU models (with respect to + * paging and physical address properties) in a single VM would require + * tracking all relevant CPUID information in kvm_mmu_page_role. That + * is very undesirable as it would double the memory requirements for + * gfn_track (see struct kvm_mmu_page_role comments), and in practice + * no sane VMM mucks with the core vCPU model on the fly. + */ + if (vcpu->arch.last_vmentry_cpu != -1) { + pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); + pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); + } +} + void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - kvm_init_mmu(vcpu, true); + kvm_init_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -5491,7 +5535,13 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - kvm_mmu_init_tdp_mmu(kvm); + if (!kvm_mmu_init_tdp_mmu(kvm)) + /* + * No smp_load/store wrappers needed here as we are in + * VM init and there cannot be any memslots / other threads + * accessing this struct kvm yet. + */ + kvm->arch.memslots_have_rmaps = true; node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -5514,29 +5564,29 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; - write_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = slot_handle_level_range(kvm, memslot, + kvm_zap_rmapp, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, start, + end - 1, true, flush); + } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + write_unlock(&kvm->mmu_lock); } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - - write_unlock(&kvm->mmu_lock); - if (is_tdp_mmu_enabled(kvm)) { flush = false; @@ -5563,12 +5613,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot, int start_level) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5636,18 +5689,17 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; - bool flush; - - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + bool flush = false; - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - read_lock(&kvm->mmu_lock); flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); if (flush) @@ -5674,11 +5726,14 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5981,6 +6036,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel static void kvm_recover_nx_lpages(struct kvm *kvm) { + unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -5992,7 +6048,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) break; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index d64ccb417c60..35567293c1fd 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -116,14 +116,19 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) kvm_x86_ops.cpu_dirty_log_size; } -bool is_nx_huge_page_enabled(void); -bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync); +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn); + struct kvm_memory_slot *slot, u64 gfn, + int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); @@ -158,8 +163,6 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, kvm_pfn_t *pfnp, int *goal_levelp); -bool is_nx_huge_page_enabled(void); - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index e798489b56b5..efbad33a0645 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -40,7 +40,7 @@ role.direct ? " direct" : "", \ access_str[role.access], \ role.invalid ? " invalid" : "", \ - role.nxe ? "" : "!", \ + role.efer_nx ? "" : "!", \ role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 34bb0ec69bd8..91a9f7e0fd91 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -100,7 +100,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, kvm_mmu_gfn_disallow_lpage(slot, gfn); if (mode == KVM_PAGE_TRACK_WRITE) - if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) kvm_flush_remote_tlbs(kvm); } EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 823a5919f9fa..490a028ddabe 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) return pkeys; } +static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, + unsigned int level, unsigned int gpte) +{ + /* + * For EPT and PAE paging (both variants), bit 7 is either reserved at + * all level or indicates a huge page (ignoring CR3/EPTP). In either + * case, bit 7 being set terminates the walk. + */ +#if PTTYPE == 32 + /* + * 32-bit paging requires special handling because bit 7 is ignored if + * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is + * greater than the last level for which bit 7 is the PAGE_SIZE bit. + * + * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7 + * is not reserved and does not indicate a large page at this level, + * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse); +#endif + /* + * PG_LEVEL_4K always terminates. The RHS has bit 7 set + * iff level <= PG_LEVEL_4K, which for our purpose means + * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PG_LEVEL_4K - 1; + + return gpte & PT_PAGE_SIZE_MASK; +} /* * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ @@ -421,7 +450,7 @@ retry_walk: /* Convert to ACC_*_MASK flags for struct guest_walker. */ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); - } while (!is_last_gpte(mmu, walker->level, pte)); + } while (!FNAME(is_last_gpte)(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; @@ -471,8 +500,7 @@ retry_walk: error: errcode |= write_fault | user_fault; - if (fetch_fault && (mmu->nx || - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) + if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu))) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; @@ -767,7 +795,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, bool self_changed = false; if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_write_protection(vcpu) && !user_fault))) + (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) return false; for (level = walker->level; level <= walker->max_level; level++) { @@ -865,8 +893,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, * we will cache the incorrect access into mmio spte. */ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_write_protection(vcpu) && !user_fault && - !is_noslot_pfn(pfn)) { + !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -876,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, * then we should prevent the kernel from executing it * if SMEP is enabled. */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (is_cr4_smep(vcpu->arch.mmu)) walker.pte_access &= ~ACC_EXEC_MASK; } @@ -1031,13 +1058,36 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; int set_spte_ret = 0; - /* direct kvm_mmu_page can not be unsync. */ - BUG_ON(sp->role.direct); + /* + * Ignore various flags when verifying that it's safe to sync a shadow + * page using the current MMU context. + * + * - level: not part of the overall MMU role and will never match as the MMU's + * level tracks the root level + * - access: updated based on the new guest PTE + * - quadrant: not part of the overall MMU role (similar to level) + */ + const union kvm_mmu_page_role sync_role_ign = { + .level = 0xf, + .access = 0x7, + .quadrant = 0x3, + }; + + /* + * Direct pages can never be unsync, and KVM should never attempt to + * sync a shadow page for a different MMU context, e.g. if the role + * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the + * reserved bits checks will be wrong, etc... + */ + if (WARN_ON_ONCE(sp->role.direct || + (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 66d43cec0c31..3e97cdb13eb7 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -103,13 +103,6 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; /* - * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set - * if PAE paging may be employed (shadow paging or any 32-bit KVM). - */ - WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) && - (spte & SPTE_TDP_AD_MASK)); - - /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, * ACC_USER_MASK and shadow_user_mask are used to represent @@ -154,13 +147,19 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. + * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. * Same reasoning can be applied to dirty page accounting. */ if (!can_unsync && is_writable_pte(old_spte)) goto out; - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + /* + * Unsync shadow pages that are reachable by the new, writable + * SPTE. Write-protect the SPTE if the page can't be unsync'd, + * e.g. it's write-tracked (upper-level SPs) or has one or more + * shadow pages and unsync'ing pages is not allowed. + */ + if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); ret |= SET_SPTE_WRITE_PROTECTED_PT; @@ -176,7 +175,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte = mark_spte_for_access_track(spte); out: - WARN_ON(is_mmio_spte(spte)); + WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), + "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, + get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + *new_spte = spte; return ret; } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index bca0ba11cccf..7a5ce9314107 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -293,6 +293,38 @@ static inline bool is_dirty_spte(u64 spte) return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } +static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, + int level) +{ + int bit7 = (pte >> 7) & 1; + + return rsvd_check->rsvd_bits_mask[bit7][level-1]; +} + +static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, + u64 pte, int level) +{ + return pte & get_rsvd_bits(rsvd_check, pte, level); +} + +static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, + u64 pte) +{ + return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); +} + +static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, + u64 spte, int level) +{ + /* + * Use a bitwise-OR instead of a logical-OR to aggregate the reserved + * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc + * (this is extremely unlikely to be short-circuited as true). + */ + return __is_bad_mt_xwr(rsvd_check, spte) | + __is_rsvd_bits_set(rsvd_check, spte, level); +} + static inline bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & shadow_host_writable_mask) && diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 237317b1eddd..0853370bd811 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -14,10 +14,10 @@ static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return; + return false; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; @@ -25,6 +25,8 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); + + return true; } static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, @@ -335,7 +337,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, for (i = 0; i < PT64_ENT_PER_PAGE; i++) { sptep = rcu_dereference(pt) + i; - gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); + gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); if (shared) { /* @@ -377,12 +379,12 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, WRITE_ONCE(*sptep, REMOVED_SPTE); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level - 1, + old_child_spte, REMOVED_SPTE, level, shared); } kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -912,7 +914,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, kvm_pfn_t pfn, bool prefault) { u64 new_spte; - int ret = 0; + int ret = RET_PF_FIXED; int make_spte_ret = 0; if (unlikely(is_noslot_pfn(pfn))) @@ -949,7 +951,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, rcu_dereference(iter->sptep)); } - if (!prefault) + /* + * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be + * consistent with legacy MMU behavior. + */ + if (ret != RET_PF_SPURIOUS) vcpu->stat.pf_fixed++; return ret; @@ -977,11 +983,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int level; int req_level; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); @@ -1017,14 +1018,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (!is_shadow_present_pte(iter.old_spte)) { /* - * If SPTE has been forzen by another thread, just + * If SPTE has been frozen by another thread, just * give up and retry, avoiding unnecessary page table * allocation and free. */ if (is_removed_spte(iter.old_spte)) break; - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); child_pt = sp->spt; new_spte = make_nonleaf_spte(child_pt, @@ -1462,15 +1463,22 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t gfn) + gfn_t gfn, int min_level) { struct tdp_iter iter; u64 new_spte; bool spte_set = false; + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, gfn, gfn + 1) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + if (!is_writable_pte(iter.old_spte)) break; @@ -1492,14 +1500,15 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn) + struct kvm_memory_slot *slot, gfn_t gfn, + int min_level) { struct kvm_mmu_page *root; bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root, slot->as_id) - spte_set |= write_protect_gfn(kvm, root, gfn); + spte_set |= write_protect_gfn(kvm, root, gfn, min_level); return spte_set; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 5fdf63090451..1cae4485b3bc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -31,7 +31,7 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1); /* * Don't allow yielding, as the caller may have a flush pending. Note, @@ -74,37 +74,40 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, bool flush); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn); + struct kvm_memory_slot *slot, gfn_t gfn, + int min_level); int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); #ifdef CONFIG_X86_64 -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -#else -static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } -static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -#endif -static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { struct kvm_mmu_page *sp; + hpa_t hpa = mmu->root_hpa; - if (!is_tdp_mmu_enabled(kvm)) - return false; if (WARN_ON(!VALID_PAGE(hpa))) return false; + /* + * A NULL shadow page is legal when shadowing a non-paging guest with + * PAE paging, as the MMU will be direct with root_hpa pointing at the + * pae_root page, not a shadow page. + */ sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; - - return is_tdp_mmu_page(sp) && sp->root_count; + return sp && is_tdp_mmu_page(sp) && sp->root_count; } +#else +static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } +#endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 5e7e920113f3..1d01da64c333 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,10 +27,6 @@ #include "irq.h" #include "svm.h" -/* enable / disable AVIC */ -bool avic; -module_param(avic, bool, S_IRUGO); - #define SVM_AVIC_DOORBELL 0xc001011b #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) @@ -124,7 +120,7 @@ void avic_vm_destroy(struct kvm *kvm) unsigned long flags; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); - if (!avic) + if (!enable_apicv) return; if (kvm_svm->avic_logical_id_table_page) @@ -147,7 +143,7 @@ int avic_vm_init(struct kvm *kvm) struct page *l_page; u32 vm_id; - if (!avic) + if (!enable_apicv) return 0; /* Allocating physical APIC ID table (4KB) */ @@ -240,7 +236,7 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT * memory region. So, we need to ensure that kvm->mm == current->mm. */ - if ((kvm->arch.apic_access_page_done == activate) || + if ((kvm->arch.apic_access_memslot_enabled == activate) || (kvm->mm != current->mm)) goto out; @@ -253,7 +249,7 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) goto out; } - kvm->arch.apic_access_page_done = activate; + kvm->arch.apic_access_memslot_enabled = activate; out: mutex_unlock(&kvm->slots_lock); return r; @@ -569,7 +565,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) int ret; struct kvm_vcpu *vcpu = &svm->vcpu; - if (!avic || !irqchip_in_kernel(vcpu->kvm)) + if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; ret = avic_init_backing_page(vcpu); @@ -593,7 +589,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) { - if (!avic || !lapic_in_kernel(vcpu)) + if (!enable_apicv || !lapic_in_kernel(vcpu)) return; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -653,7 +649,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; bool activated = kvm_vcpu_apicv_active(vcpu); - if (!avic) + if (!enable_apicv) return; if (activated) { diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5e8d8443154e..21d03e3a5dfd 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -98,13 +98,18 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; + + /* + * The NPT format depends on L1's CR4 and EFER, which is in vmcb01. Note, + * when called via KVM_SET_NESTED_STATE, that state may _not_ match current + * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required. + */ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4, svm->vmcb01.ptr->save.efer, svm->nested.ctl.nested_cr3); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -380,33 +385,47 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) +{ + /* + * TODO: optimize unconditional TLB flush/MMU sync. A partial list of + * things to fix before this can be conditional: + * + * - Flush TLBs for both L1 and L2 remote TLB flush + * - Honor L1's request to flush an ASID on nested VMRUN + * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*] + * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN + * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST + * + * [*] Unlike nested EPT, SVM's ASID management can invalidate nested + * NPT guest-physical mappings on VMRUN. + */ + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); +} + /* * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true * if we are emulating VM-Entry into a guest with NPT enabled. */ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, - bool nested_npt) + bool nested_npt, bool reload_pdptrs) { if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) return -EINVAL; - if (!nested_npt && is_pae_paging(vcpu) && - (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) - return -EINVAL; - } + if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && + CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) + return -EINVAL; - /* - * TODO: optimize unconditional TLB flush/MMU sync here and in - * kvm_init_shadow_npt_mmu(). - */ if (!nested_npt) - kvm_mmu_new_pgd(vcpu, cr3, false, false); + kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); - kvm_init_mmu(vcpu, false); + /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ + kvm_init_mmu(vcpu); return 0; } @@ -481,6 +500,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + struct kvm_vcpu *vcpu = &svm->vcpu; /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, @@ -505,10 +525,10 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) /* nested_cr3. */ if (nested_npt_enabled(svm)) - nested_svm_init_mmu_context(&svm->vcpu); + nested_svm_init_mmu_context(vcpu); - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = - svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset = + vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & ~mask) | @@ -523,8 +543,10 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; + nested_svm_transition_tlb_flush(vcpu); + /* Enter Guest-Mode */ - enter_guest_mode(&svm->vcpu); + enter_guest_mode(vcpu); /* * Merge guest and host intercepts - must be called with vcpu in @@ -576,7 +598,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, - nested_npt_enabled(svm)); + nested_npt_enabled(svm), true); if (ret) return ret; @@ -596,8 +618,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct kvm_host_map map; u64 vmcb12_gpa; - ++vcpu->stat.nested_run; - if (is_smm(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -803,9 +823,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_vcpu_unmap(vcpu, &map, true); + nested_svm_transition_tlb_flush(vcpu); + nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false); + rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); if (rc) return 1; @@ -1228,8 +1250,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; struct vmcb_save_area *save; + unsigned long cr0; int ret; - u32 cr0; BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) > KVM_STATE_NESTED_SVM_VMCB_SIZE); @@ -1302,6 +1324,19 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + + /* * All checks done, we can enter guest mode. Userspace provides * vmcb12.control, which will be combined with L1 and stored into * vmcb02, and the L1 save state which we store in vmcb01. @@ -1358,9 +1393,15 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) if (WARN_ON(!is_guest_mode(vcpu))) return true; - if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm))) - return false; + if (!vcpu->arch.pdptrs_from_userspace && + !nested_npt_enabled(svm) && is_pae_paging(vcpu)) + /* + * Reload the guest's PDPTRs since after a migration + * the guest CR3 might be restored prior to setting the nested + * state which can lead to a load of wrong PDPTRs. + */ + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + return false; if (!nested_svm_vmrun_msrpm(svm)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e088086f3de6..8834822c00cd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -43,6 +43,9 @@ #include "svm.h" #include "svm_ops.h" +#include "kvm_onhyperv.h" +#include "svm_onhyperv.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -185,6 +188,13 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* + * enable / disable AVIC. Because the defaults differ for APICv + * support between VMX and SVM we cannot use module_param_named. + */ +static bool avic; +module_param(avic, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -673,6 +683,9 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); msrpm[offset] = tmp; + + svm_hv_vmcb_dirty_nested_enlightenments(vcpu); + } void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, @@ -939,6 +952,16 @@ static __init int svm_hardware_setup(void) int r; unsigned int order = get_order(IOPM_SIZE); + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + iopm_pages = alloc_pages(GFP_KERNEL, order); if (!iopm_pages) @@ -952,9 +975,6 @@ static __init int svm_hardware_setup(void) supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); @@ -996,6 +1016,8 @@ static __init int svm_hardware_setup(void) /* Note, SEV setup consumes npt_enabled. */ sev_hardware_setup(); + svm_hv_hardware_setup(); + svm_adjust_mmio_mask(); for_each_possible_cpu(cpu) { @@ -1009,14 +1031,12 @@ static __init int svm_hardware_setup(void) nrips = false; } - if (avic) { - if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) { - avic = false; - } else { - pr_info("AVIC enabled\n"); + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } + if (enable_apicv) { + pr_info("AVIC enabled\n"); + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } if (vls) { @@ -1080,26 +1100,30 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u64 g_tsc_offset = 0; - if (is_guest_mode(vcpu)) { - /* Write L1's TSC offset. */ - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->vmcb01.ptr->control.tsc_offset; - svm->vmcb01.ptr->control.tsc_offset = offset; - } + return svm->nested.ctl.tsc_offset; +} - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset - g_tsc_offset, - offset); +static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + return kvm_default_tsc_scaling_ratio; +} - svm->vmcb->control.tsc_offset = offset + g_tsc_offset; +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; + svm->vmcb->control.tsc_offset = offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - return svm->vmcb->control.tsc_offset; +} + +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); } /* Evaluate instruction intercepts that depend on guest CPUID features. */ @@ -1287,6 +1311,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } } + svm_hv_init_vmcb(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -3106,6 +3132,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) return; } + pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", + svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3762,6 +3790,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm_hv_update_vp_id(svm->vmcb, vcpu); + /* * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. @@ -3835,6 +3865,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm->next_rip = 0; if (is_guest_mode(vcpu)) { nested_sync_control_from_vmcb02(svm); + + /* Track VMRUNs that have made past consistency checking */ + if (svm->nested.nested_run_pending && + svm->vmcb->control.exit_code != SVM_EXIT_ERR) + ++vcpu->stat.nested_run; + svm->nested.nested_run_pending = 0; } @@ -3846,10 +3882,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - if (npt_enabled) { - vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); - vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); - } + if (npt_enabled) + kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); /* * We need to handle MC intercepts here before the vcpu has a chance to @@ -3877,6 +3911,8 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); vmcb_mark_dirty(svm->vmcb, VMCB_NPT); + hv_track_root_tdp(vcpu, root_hpa); + /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) return; @@ -4249,7 +4285,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !svm_smi_blocked(vcpu); } -static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); int ret; @@ -4271,7 +4307,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map; @@ -4427,13 +4463,12 @@ static int svm_vm_init(struct kvm *kvm) if (!pause_filter_count || !pause_filter_thresh) kvm->arch.pause_in_guest = true; - if (avic) { + if (enable_apicv) { int ret = avic_vm_init(kvm); if (ret) return ret; } - kvm_apicv_init(kvm, avic); return 0; } @@ -4524,7 +4559,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .write_l1_tsc_offset = svm_write_l1_tsc_offset, + .get_l2_tsc_offset = svm_get_l2_tsc_offset, + .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, + .write_tsc_offset = svm_write_tsc_offset, + .write_tsc_multiplier = svm_write_tsc_multiplier, .load_mmu_pgd = svm_load_mmu_pgd, @@ -4544,8 +4582,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, - .pre_enter_smm = svm_pre_enter_smm, - .pre_leave_smm = svm_pre_leave_smm, + .enter_smm = svm_enter_smm, + .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, .mem_enc_op = svm_mem_enc_op, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2908c6ab5bb4..f89b623bb591 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,6 +32,11 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +/* + * Clean bits in VMCB. + * VMCB_ALL_CLEAN_MASK might also need to + * be updated if this enum is modified. + */ enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, pause filter count */ @@ -49,9 +54,17 @@ enum { * AVIC PHYSICAL_TABLE pointer, * AVIC LOGICAL_TABLE pointer */ - VMCB_DIRTY_MAX, + VMCB_SW = 31, /* Reserved for hypervisor/software use */ }; +#define VMCB_ALL_CLEAN_MASK ( \ + (1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \ + (1U << VMCB_ASID) | (1U << VMCB_INTR) | \ + (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \ + (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \ + (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \ + (1U << VMCB_SW)) + /* TPR and CR2 are always written before VMRUN */ #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) @@ -238,10 +251,15 @@ static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) static inline void vmcb_mark_all_clean(struct vmcb *vmcb) { - vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) + vmcb->control.clean = VMCB_ALL_CLEAN_MASK & ~VMCB_ALWAYS_DIRTY_MASK; } +static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit) +{ + return (vmcb->control.clean & (1 << bit)); +} + static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); @@ -480,8 +498,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -extern bool avic; - static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c new file mode 100644 index 000000000000..98aa981c04ec --- /dev/null +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM L1 hypervisor optimizations on Hyper-V for SVM. + */ + +#include <linux/kvm_host.h> +#include "kvm_cache_regs.h" + +#include <asm/mshyperv.h> + +#include "svm.h" +#include "svm_ops.h" + +#include "hyperv.h" +#include "kvm_onhyperv.h" +#include "svm_onhyperv.h" + +int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +{ + struct hv_enlightenments *hve; + struct hv_partition_assist_pg **p_hv_pa_pg = + &to_kvm_hv(vcpu->kvm)->hv_pa_pg; + + if (!*p_hv_pa_pg) + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); + + if (!*p_hv_pa_pg) + return -ENOMEM; + + hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw; + + hve->partition_assist_page = __pa(*p_hv_pa_pg); + hve->hv_vm_id = (unsigned long)vcpu->kvm; + if (!hve->hv_enlightenments_control.nested_flush_hypercall) { + hve->hv_enlightenments_control.nested_flush_hypercall = 1; + vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + } + + return 0; +} + diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h new file mode 100644 index 000000000000..9b9a55abc29f --- /dev/null +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM L1 hypervisor optimizations on Hyper-V for SVM. + */ + +#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__ +#define __ARCH_X86_KVM_SVM_ONHYPERV_H__ + +#if IS_ENABLED(CONFIG_HYPERV) +#include <asm/mshyperv.h> + +#include "hyperv.h" +#include "kvm_onhyperv.h" + +static struct kvm_x86_ops svm_x86_ops; + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB + * control area to expose SVM enlightenments to guests. + */ +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW + +int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); + +static inline void svm_hv_init_vmcb(struct vmcb *vmcb) +{ + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + + if (npt_enabled && + ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) + hve->hv_enlightenments_control.enlightened_npt_tlb = 1; +} + +static inline void svm_hv_hardware_setup(void) +{ + if (npt_enabled && + ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { + pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n"); + svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; + svm_x86_ops.tlb_remote_flush_with_range = + hv_remote_flush_tlb_with_range; + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { + int cpu; + + pr_info("kvm: Hyper-V Direct TLB Flush enabled\n"); + for_each_online_cpu(cpu) { + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->nested_control.features.directhypercall = 1; + } + svm_x86_ops.enable_direct_tlbflush = + svm_hv_enable_direct_tlbflush; + } +} + +static inline void svm_hv_vmcb_dirty_nested_enlightenments( + struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + + /* + * vmcb can be NULL if called during early vcpu init. + * And its okay not to mark vmcb dirty during vcpu init + * as we mark it dirty unconditionally towards end of vcpu + * init phase. + */ + if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && + hve->hv_enlightenments_control.msr_bitmap) + vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); +} + +static inline void svm_hv_update_vp_id(struct vmcb *vmcb, + struct kvm_vcpu *vcpu) +{ + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + u32 vp_index = kvm_hv_get_vpindex(vcpu); + + if (hve->hv_vp_id != vp_index) { + hve->hv_vp_id = vp_index; + vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + } +} +#else + +static inline void svm_hv_init_vmcb(struct vmcb *vmcb) +{ +} + +static inline void svm_hv_hardware_setup(void) +{ +} + +static inline void svm_hv_vmcb_dirty_nested_enlightenments( + struct kvm_vcpu *vcpu) +{ +} + +static inline void svm_hv_update_vp_id(struct vmcb *vmcb, + struct kvm_vcpu *vcpu) +{ +} +#endif /* CONFIG_HYPERV */ + +#endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4f839148948b..b484141ea15b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -997,7 +997,7 @@ TRACE_EVENT(kvm_wait_lapic_expire, __entry->delta < 0 ? "early" : "late") ); -TRACE_EVENT(kvm_enter_smm, +TRACE_EVENT(kvm_smm_transition, TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), TP_ARGS(vcpu_id, smbase, entering), diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index aa0e7872fcc9..4705ad55abb5 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -12,7 +12,6 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; -extern bool __read_mostly enable_apicv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 41f24661af04..896b2a50b4aa 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -319,6 +319,9 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) if (unlikely(!assist_page.enlighten_vmentry)) return false; + if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs))) + return false; + *evmcs_gpa = assist_page.current_nested_vmcs; return true; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index bd41d9462355..2ec9b46f0d0c 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -197,6 +197,14 @@ static inline void evmcs_load(u64 phys_addr) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +#define EVMPTR_INVALID (-1ULL) +#define EVMPTR_MAP_PENDING (-2ULL) + +static inline bool evmptr_is_valid(u64 evmptr) +{ + return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; +} + enum nested_evmptrld_status { EVMPTRLD_DISABLED, EVMPTRLD_SUCCEEDED, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6058a65a6ede..1a52134b0c42 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -173,9 +173,13 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, | X86_EFLAGS_ZF); get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed + * We don't need to force sync to shadow VMCS because + * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all + * fields and thus must be synced. */ + if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; + return kvm_skip_emulated_instruction(vcpu); } @@ -187,7 +191,8 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + if (vmx->nested.current_vmptr == -1ull && + !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return nested_vmx_failInvalid(vcpu); return nested_vmx_failValid(vcpu, vm_instruction_error); @@ -221,12 +226,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->nested.hv_evmcs) - return; + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); + vmx->nested.hv_evmcs = NULL; + } - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs_vmptr = 0; - vmx->nested.hv_evmcs = NULL; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, @@ -346,16 +351,21 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vmcs12->guest_physical_address = fault->address; } +static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) +{ + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.msrs.ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT, + nested_ept_ad_enabled(vcpu), + nested_ept_get_eptp(vcpu)); +} + static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.msrs.ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu), - nested_ept_get_eptp(vcpu)); + nested_ept_new_eptp(vcpu); vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; @@ -1058,54 +1068,13 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, } /* - * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. - * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't - * enable VPID for L2 (implying it expects a TLB flush on VMX transitions). - * Here's why. - * - * If EPT is enabled by L0 a sync is never needed: - * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there - * cannot be unsync'd SPTEs for either L1 or L2. - * - * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter - * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings - * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush - * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't - * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit. - * - * If EPT is disabled by L0: - * - if VPID is enabled by L1 (for L2), the situation is similar to when L1 - * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't - * required to invalidate linear mappings (EPT is disabled so there are - * no combined or guest-physical mappings), i.e. L1 can't rely on the - * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1). - * - * - however if VPID is disabled by L1, then a sync is needed as L1 expects all - * linear mappings (EPT is disabled so there are no combined or guest-physical - * mappings) to be invalidated on both VM-Enter and VM-Exit. - * - * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which - * additionally checks that L2 has been assigned a VPID (when EPT is disabled). - * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect - * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2 - * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has - * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1 - * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't - * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush - * stale TLB entries, at which point L0 will sync L2's MMU. - */ -static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) -{ - return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu)); -} - -/* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to * @entry_failure_code. */ -static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, + bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code) { if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { @@ -1117,27 +1086,20 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced. */ - if (!nested_ept && is_pae_paging(vcpu) && - (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { - *entry_failure_code = ENTRY_FAIL_PDPTE; - return -EINVAL; - } + if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && + CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return -EINVAL; } - /* - * Unconditionally skip the TLB flush on fast CR3 switch, all TLB - * flushes are handled by nested_vmx_transition_tlb_flush(). See - * nested_vmx_transition_mmu_sync for details on skipping the MMU sync. - */ if (!nested_ept) - kvm_mmu_new_pgd(vcpu, cr3, true, - !nested_vmx_transition_mmu_sync(vcpu)); + kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); - kvm_init_mmu(vcpu, false); + /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ + kvm_init_mmu(vcpu); return 0; } @@ -1170,17 +1132,28 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * If VPID is disabled, linear and combined mappings are flushed on - * VM-Enter/VM-Exit, and guest-physical mappings are valid only for - * their associated EPTP. + * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings + * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a + * full TLB flush from the guest's perspective. This is required even + * if VPID is disabled in the host as KVM may need to synchronize the + * MMU in response to the guest TLB flush. + * + * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. + * EPT is a special snowflake, as guest-physical mappings aren't + * flushed on VPID invalidations, including VM-Enter or VM-Exit with + * VPID disabled. As a result, KVM _never_ needs to sync nEPT + * entries on VM-Enter because L1 can't rely on VM-Enter to flush + * those mappings. */ - if (!enable_vpid) + if (!nested_cpu_has_vpid(vmcs12)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return; + } + + /* L2 should never have a VPID if VPID is disabled. */ + WARN_ON(!enable_vpid); /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit. - * * If VPID is enabled and used by vmc12, but L2 does not have a unique * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate * a VPID for L2, flush the current context as the effective ASID is @@ -1192,13 +1165,12 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, * * If a TLB flush isn't required due to any of the above, and vpid12 is * changing then the new "virtual" VPID (vpid12) will reuse the same - * "real" VPID (vpid02), and so needs to be sync'd. There is no direct + * "real" VPID (vpid02), and so needs to be flushed. There's no direct * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for - * all nested vCPUs. + * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate + * guest-physical mappings, so there is no need to sync the nEPT MMU. */ - if (!nested_cpu_has_vpid(vmcs12)) { - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } else if (!nested_has_guest_tlb_tag(vcpu)) { + if (!nested_has_guest_tlb_tag(vcpu)) { kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } else if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { @@ -1586,7 +1558,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(vmx->loaded_vmcs->vmcs); } -static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) +static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; @@ -1595,7 +1567,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { vmcs12->guest_rsp = evmcs->guest_rsp; vmcs12->guest_rflags = evmcs->guest_rflags; @@ -1603,23 +1575,23 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) evmcs->guest_interruptibility_info; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { vmcs12->cpu_based_vm_exec_control = evmcs->cpu_based_vm_exec_control; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { vmcs12->exception_bitmap = evmcs->exception_bitmap; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { vmcs12->vm_entry_controls = evmcs->vm_entry_controls; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { vmcs12->vm_entry_intr_info_field = evmcs->vm_entry_intr_info_field; @@ -1629,7 +1601,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) evmcs->vm_entry_instruction_len; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { vmcs12->host_ia32_pat = evmcs->host_ia32_pat; vmcs12->host_ia32_efer = evmcs->host_ia32_efer; @@ -1649,7 +1621,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->host_tr_selector = evmcs->host_tr_selector; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { vmcs12->pin_based_vm_exec_control = evmcs->pin_based_vm_exec_control; @@ -1658,18 +1630,18 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) evmcs->secondary_vm_exec_control; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { vmcs12->io_bitmap_a = evmcs->io_bitmap_a; vmcs12->io_bitmap_b = evmcs->io_bitmap_b; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { vmcs12->msr_bitmap = evmcs->msr_bitmap; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { vmcs12->guest_es_base = evmcs->guest_es_base; vmcs12->guest_cs_base = evmcs->guest_cs_base; @@ -1709,14 +1681,14 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->guest_tr_selector = evmcs->guest_tr_selector; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { vmcs12->tsc_offset = evmcs->tsc_offset; vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; @@ -1728,7 +1700,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->guest_dr7 = evmcs->guest_dr7; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { vmcs12->host_fs_base = evmcs->host_fs_base; vmcs12->host_gs_base = evmcs->host_gs_base; @@ -1738,13 +1710,13 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->host_rsp = evmcs->host_rsp; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { vmcs12->ept_pointer = evmcs->ept_pointer; vmcs12->virtual_processor_id = evmcs->virtual_processor_id; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; @@ -1799,10 +1771,10 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; */ - return 0; + return; } -static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) +static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; @@ -1962,7 +1934,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; - return 0; + return; } /* @@ -1979,13 +1951,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( if (likely(!vmx->nested.enlightened_vmcs_enabled)) return EVMPTRLD_DISABLED; - if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) + if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { + nested_release_evmcs(vcpu); return EVMPTRLD_DISABLED; + } - if (unlikely(!vmx->nested.hv_evmcs || - evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { - if (!vmx->nested.hv_evmcs) - vmx->nested.current_vmptr = -1ull; + if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { + vmx->nested.current_vmptr = -1ull; nested_release_evmcs(vcpu); @@ -2023,7 +1995,6 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( return EVMPTRLD_VMFAIL; } - vmx->nested.dirty_vmcs12 = true; vmx->nested.hv_evmcs_vmptr = evmcs_gpa; evmcs_gpa_changed = true; @@ -2056,14 +2027,10 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.hv_evmcs) { + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) copy_vmcs12_to_enlightened(vmx); - /* All fields are clean */ - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - } else { + else copy_vmcs12_to_shadow(vmx); - } vmx->nested.need_vmcs12_to_shadow_sync = false; } @@ -2208,7 +2175,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) + if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) prepare_vmcs02_early_rare(vmx, vmcs12); /* @@ -2277,7 +2244,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_ENABLE_VMFUNC); + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_TSC_SCALING); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -2488,18 +2456,18 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; bool load_guest_pdptrs_vmcs12 = false; - if (vmx->nested.dirty_vmcs12 || hv_evmcs) { + if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; - load_guest_pdptrs_vmcs12 = !hv_evmcs || - !(hv_evmcs->hv_clean_fields & + load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || + !(vmx->nested.hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } @@ -2532,10 +2500,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + vmx_get_l2_tsc_offset(vcpu), + vmx_get_l2_tsc_multiplier(vcpu)); + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + vcpu->arch.l1_tsc_scaling_ratio, + vmx_get_l2_tsc_multiplier(vcpu)); + + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -2572,7 +2548,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), - entry_failure_code)) + from_vmentry, entry_failure_code)) return -EINVAL; /* @@ -2604,6 +2580,17 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); + + /* + * It was observed that genuine Hyper-V running in L1 doesn't reset + * 'hv_clean_fields' by itself, it only sets the corresponding dirty + * bits when it changes a field in eVMCS. Mark all fields as clean + * here. + */ + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + return 0; } @@ -3093,13 +3080,20 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. */ - if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { + if (vmx->nested.enlightened_vmcs_enabled && + vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); if (evmptrld_status == EVMPTRLD_VMFAIL || evmptrld_status == EVMPTRLD_ERROR) return false; + + /* + * Post migration VMCS12 always provides the most actual + * information, copy it to eVMCS upon entry. + */ + vmx->nested.need_vmcs12_to_shadow_sync = true; } return true; @@ -3113,6 +3107,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct page *page; u64 hpa; + if (!vcpu->arch.pdptrs_from_userspace && + !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { + /* + * Reload the guest's PDPTRs since after a migration + * the guest CR3 might be restored prior to setting the nested + * state which can lead to a load of wrong PDPTRs. + */ + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + return false; + } + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3175,6 +3181,15 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) offset_in_page(vmcs12->posted_intr_desc_addr)); vmcs_write64(POSTED_INTR_DESC_ADDR, pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); + } else { + /* + * Defer the KVM_INTERNAL_EXIT until KVM tries to + * access the contents of the VMCS12 posted interrupt + * descriptor. (Note that KVM may do this when it + * should not, per the architectural specification.) + */ + vmx->nested.pi_desc = NULL; + pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); } } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) @@ -3354,10 +3369,8 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset += vmcs12->tsc_offset; - if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; @@ -3437,7 +3450,7 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason.full; - if (enable_shadow_vmcs || vmx->nested.hv_evmcs) + if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; } @@ -3454,8 +3467,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); enum nested_evmptrld_status evmptrld_status; - ++vcpu->stat.nested_run; - if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3467,7 +3478,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return nested_vmx_failInvalid(vcpu); } - if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) + if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && + vmx->nested.current_vmptr == -1ull)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -3481,8 +3493,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); - if (vmx->nested.hv_evmcs) { - copy_enlightened_to_vmcs12(vmx); + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; } else if (enable_shadow_vmcs) { @@ -3682,25 +3694,29 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) } } -static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; void *vapic_page; u16 status; - if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) - return; + if (!vmx->nested.pi_pending) + return 0; + + if (!vmx->nested.pi_desc) + goto mmio_needed; vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; + return 0; max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) - return; + goto mmio_needed; __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); @@ -3713,6 +3729,11 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) } nested_mark_vmcs12_pages_dirty(vcpu); + return 0; + +mmio_needed: + kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); + return -ENXIO; } static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, @@ -3887,8 +3908,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } no_vmexit: - vmx_complete_nested_posted_interrupt(vcpu); - return 0; + return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) @@ -4032,10 +4052,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.hv_evmcs) + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = + !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -4206,7 +4227,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * Only PDPTE load can fail as the value of cr3 was checked on entry and * couldn't have changed. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); @@ -4463,8 +4484,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { + vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + } if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); @@ -4501,12 +4525,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_has_tsc_control) + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); + if (vmx->nested.l1_tpr_threshold != -1) vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); - if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); @@ -4532,7 +4556,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } if ((vm_exit_reason != -1) && - (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ @@ -4987,6 +5011,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); + } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { + nested_release_evmcs(vcpu); } return nested_vmx_succeed(vcpu); @@ -5228,7 +5254,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ - if (vmx->nested.hv_evmcs) + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return 1; if (vmx->nested.current_vmptr != vmptr) { @@ -5284,7 +5310,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) + if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) return 1; if (get_vmx_mem_address(vcpu, exit_qual, instr_info, @@ -5461,8 +5487,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) /* * Sync the shadow page tables if EPT is disabled, L1 is invalidating - * linear mappings for L2 (tagged with L2's VPID). Free all roots as - * VPIDs are not tracked in the MMU role. + * linear mappings for L2 (tagged with L2's VPID). Free all guest + * roots as VPIDs are not tracked in the MMU role. * * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share * an MMU when EPT is disabled. @@ -5470,8 +5496,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. */ if (!enable_ept) - kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, - KVM_MMU_ROOTS_ALL); + kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu); return nested_vmx_succeed(vcpu); } @@ -5481,23 +5506,16 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, { u32 index = kvm_rcx_read(vcpu); u64 new_eptp; - bool accessed_dirty; - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (!nested_cpu_has_eptp_switching(vmcs12) || - !nested_cpu_has_ept(vmcs12)) + if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) return 1; - if (index >= VMFUNC_EPTP_ENTRIES) return 1; - if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, &new_eptp, index * 8, 8)) return 1; - accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); - /* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else @@ -5506,11 +5524,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; - mmu->ept_ad = accessed_dirty; - mmu->mmu_role.base.ad_disabled = !accessed_dirty; vmcs12->ept_pointer = new_eptp; + nested_ept_new_eptp(vcpu); - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + if (!nested_cpu_has_vpid(vmcs12)) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } return 0; @@ -5533,7 +5551,17 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) } vmcs12 = get_vmcs12(vcpu); - if ((vmcs12->vm_function_control & (1 << function)) == 0) + + /* + * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC + * is enabled in vmcs02 if and only if it's enabled in vmcs12. + */ + if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!(vmcs12->vm_function_control & BIT_ULL(function))) goto fail; switch (function) { @@ -5806,6 +5834,9 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, else if (is_breakpoint(intr_info) && vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return true; + else if (is_alignment_check(intr_info) && + !vmx_guest_inject_ac(vcpu)) + return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; @@ -6056,7 +6087,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); - if (vmx->nested.hv_evmcs) + /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ + if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && @@ -6112,8 +6144,15 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, } else { copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (vmx->nested.hv_evmcs) - copy_enlightened_to_vmcs12(vmx); + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + /* + * L1 hypervisor is not obliged to keep eVMCS + * clean fields data always up-to-date while + * not in guest mode, 'hv_clean_fields' is only + * supposed to be actual upon vmentry so we need + * to ignore it here and do full copy. + */ + copy_enlightened_to_vmcs12(vmx, 0); else if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); } @@ -6255,6 +6294,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). */ + vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } else { return -EINVAL; @@ -6339,6 +6379,40 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) } /* + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo + * that madness to get the encoding for comparison. + */ +#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) + +static u64 nested_vmx_calc_vmcs_enum_msr(void) +{ + /* + * Note these are the so called "index" of the VMCS field encoding, not + * the index into vmcs12. + */ + unsigned int max_idx, idx; + int i; + + /* + * For better or worse, KVM allows VMREAD/VMWRITE to all fields in + * vmcs12, regardless of whether or not the associated feature is + * exposed to L1. Simply find the field with the highest index. + */ + max_idx = 0; + for (i = 0; i < nr_vmcs12_fields; i++) { + /* The vmcs12 table is very, very sparsely populated. */ + if (!vmcs_field_to_offset_table[i]) + continue; + + idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); + if (idx > max_idx) + max_idx = idx; + } + + return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; +} + +/* * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be * returned for the various VMX controls MSRs when nested VMX is enabled. * The same values should also be used to verify that vmcs12 control fields are @@ -6474,7 +6548,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_TSC_SCALING; /* * We can emulate "VMCS shadowing," even if the hardware @@ -6582,8 +6657,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); - /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } void nested_vmx_hardware_unsetup(void) diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 184418baeb3c..b69a80f43b37 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -56,14 +56,9 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * In case we do two consecutive get/set_nested_state()s while L2 was - * running hv_evmcs may end up not being mapped (we map it from - * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always - * have vmcs12 if it is true. - */ - return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || - vmx->nested.hv_evmcs; + /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ + return vmx->nested.current_vmptr != -1ull || + vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID; } static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 1472c6c376f7..4b9957e2bf5b 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -117,6 +117,11 @@ static inline bool is_gp_fault(u32 intr_info) return is_exception_n(intr_info, GP_VECTOR); } +static inline bool is_alignment_check(u32 intr_info) +{ + return is_exception_n(intr_info, AC_VECTOR); +} + static inline bool is_machine_check(u32 intr_info) { return is_exception_n(intr_info, MC_VECTOR); @@ -164,4 +169,12 @@ static inline int vmcs_field_readonly(unsigned long field) return (((field >> 10) & 0x3) == 1); } +#define VMCS_FIELD_INDEX_SHIFT (1) +#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1) + +static inline unsigned int vmcs_field_index(unsigned long field) +{ + return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT; +} + #endif /* __KVM_X86_VMX_VMCS_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 034adb6404dc..d9f5d7c56ae3 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -37,6 +37,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), FIELD64(PML_ADDRESS, pml_address), FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(TSC_MULTIPLIER, tsc_multiplier), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 13494956d0e9..5e0e1b39f495 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -70,7 +70,8 @@ struct __packed vmcs12 { u64 eptp_list_address; u64 pml_address; u64 encls_exiting_bitmap; - u64 padding64[2]; /* room for future expansion */ + u64 tsc_multiplier; + u64 padding64[1]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -205,12 +206,6 @@ struct __packed vmcs12 { #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE /* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - -/* * For save/restore compatibility, the vmcs12 field offsets must not change. */ #define CHECK_OFFSET(field, loc) \ @@ -258,6 +253,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); CHECK_OFFSET(encls_exiting_bitmap, 320); + CHECK_OFFSET(tsc_multiplier, 328); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c2a779b688e6..927a552393b9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -52,6 +52,7 @@ #include "cpuid.h" #include "evmcs.h" #include "hyperv.h" +#include "kvm_onhyperv.h" #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" @@ -101,7 +102,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); /* @@ -459,86 +459,6 @@ static unsigned long host_idt_base; static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, - void *data) -{ - struct kvm_tlb_range *range = data; - - return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, - range->pages); -} - -static inline int hv_remote_flush_root_ept(hpa_t root_ept, - struct kvm_tlb_range *range) -{ - if (range) - return hyperv_flush_guest_mapping_range(root_ept, - kvm_fill_hv_flush_list_func, (void *)range); - else - return hyperv_flush_guest_mapping(root_ept); -} - -static int hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_tlb_range *range) -{ - struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); - struct kvm_vcpu *vcpu; - int ret = 0, i, nr_unique_valid_roots; - hpa_t root; - - spin_lock(&kvm_vmx->hv_root_ept_lock); - - if (!VALID_PAGE(kvm_vmx->hv_root_ept)) { - nr_unique_valid_roots = 0; - - /* - * Flush all valid roots, and see if all vCPUs have converged - * on a common root, in which case future flushes can skip the - * loop and flush the common root. - */ - kvm_for_each_vcpu(i, vcpu, kvm) { - root = to_vmx(vcpu)->hv_root_ept; - if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept) - continue; - - /* - * Set the tracked root to the first valid root. Keep - * this root for the entirety of the loop even if more - * roots are encountered as a low effort optimization - * to avoid flushing the same (first) root again. - */ - if (++nr_unique_valid_roots == 1) - kvm_vmx->hv_root_ept = root; - - if (!ret) - ret = hv_remote_flush_root_ept(root, range); - - /* - * Stop processing roots if a failure occurred and - * multiple valid roots have already been detected. - */ - if (ret && nr_unique_valid_roots > 1) - break; - } - - /* - * The optimized flush of a single root can't be used if there - * are multiple valid roots (obviously). - */ - if (nr_unique_valid_roots > 1) - kvm_vmx->hv_root_ept = INVALID_PAGE; - } else { - ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range); - } - - spin_unlock(&kvm_vmx->hv_root_ept_lock); - return ret; -} -static int hv_remote_flush_tlb(struct kvm *kvm) -{ - return hv_remote_flush_tlb_with_range(kvm, NULL); -} - static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; @@ -566,21 +486,6 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) #endif /* IS_ENABLED(CONFIG_HYPERV) */ -static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept) -{ -#if IS_ENABLED(CONFIG_HYPERV) - struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); - - if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { - spin_lock(&kvm_vmx->hv_root_ept_lock); - to_vmx(vcpu)->hv_root_ept = root_ept; - if (root_ept != kvm_vmx->hv_root_ept) - kvm_vmx->hv_root_ept = INVALID_PAGE; - spin_unlock(&kvm_vmx->hv_root_ept_lock); - } -#endif -} - /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -842,16 +747,21 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; else { - /* - * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched - * between guest and host. In that case we only care about present - * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in - * prepare_vmcs02_rare. - */ - bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR)); - int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0; + int mask = 0, match = 0; + + if (enable_ept && (eb & (1u << PF_VECTOR))) { + /* + * If EPT is enabled, #PF is currently only intercepted + * if MAXPHYADDR is smaller on the guest than on the + * host. In that case we only care about present, + * non-reserved faults. For vmcs02, however, PFEC_MASK + * and PFEC_MATCH are set in prepare_vmcs02_rare. + */ + mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; + match = PFERR_PRESENT_MASK; + } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } vmcs_write32(EXCEPTION_BITMAP, eb); @@ -1390,11 +1300,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, vmx->loaded_vmcs->cpu = cpu; } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); } /* @@ -1787,26 +1692,35 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->guest_uret_msrs_loaded = false; } -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u64 g_tsc_offset = 0; - /* - * We're here if L1 chose not to trap WRMSR to TSC. According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. - */ - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) - g_tsc_offset = vmcs12->tsc_offset; + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) + return vmcs12->tsc_offset; + + return 0; +} + +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + return vmcs12->tsc_multiplier; - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vcpu->arch.tsc_offset - g_tsc_offset, - offset); - vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); - return offset + g_tsc_offset; + return kvm_default_tsc_scaling_ratio; +} + +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + vmcs_write64(TSC_OFFSET, offset); +} + +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + vmcs_write64(TSC_MULTIPLIER, multiplier); } /* @@ -3181,7 +3095,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, eptp = construct_eptp(vcpu, root_hpa, root_level); vmcs_write64(EPT_POINTER, eptp); - hv_track_root_ept(vcpu, root_hpa); + hv_track_root_tdp(vcpu, root_hpa); if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; @@ -3707,7 +3621,7 @@ static int alloc_apic_access_page(struct kvm *kvm) int ret = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page_done) + if (kvm->arch.apic_access_memslot_enabled) goto out; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); @@ -3727,7 +3641,7 @@ static int alloc_apic_access_page(struct kvm *kvm) * is able to migrate it. */ put_page(page); - kvm->arch.apic_access_page_done = true; + kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return ret; @@ -4829,7 +4743,7 @@ static int handle_machine_check(struct kvm_vcpu *vcpu) * - Guest has #AC detection enabled in CR0 * - Guest EFLAGS has AC bit set */ -static inline bool guest_inject_ac(struct kvm_vcpu *vcpu) +bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) { if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) return true; @@ -4937,7 +4851,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: - if (guest_inject_ac(vcpu)) { + if (vmx_guest_inject_ac(vcpu)) { kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1; } @@ -5810,6 +5724,8 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", + vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), @@ -6806,7 +6722,18 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_load_host_xsave_state(vcpu); - vmx->nested.nested_run_pending = 0; + if (is_guest_mode(vcpu)) { + /* + * Track VMLAUNCH/VMRESUME that have made past guest state + * checking. + */ + if (vmx->nested.nested_run_pending && + !vmx->exit_reason.failed_vmentry) + ++vcpu->stat.nested_run; + + vmx->nested.nested_run_pending = 0; + } + vmx->idt_vectoring_info = 0; if (unlikely(vmx->fail)) { @@ -6941,6 +6868,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; @@ -6952,9 +6880,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.sn = 1; -#if IS_ENABLED(CONFIG_HYPERV) - vmx->hv_root_ept = INVALID_PAGE; -#endif return 0; free_vmcs: @@ -6971,10 +6896,6 @@ free_vpid: static int vmx_vm_init(struct kvm *kvm) { -#if IS_ENABLED(CONFIG_HYPERV) - spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock); -#endif - if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -7001,7 +6922,6 @@ static int vmx_vm_init(struct kvm *kvm) break; } } - kvm_apicv_init(kvm, enable_apicv); return 0; } @@ -7453,10 +7373,10 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, &delta_tsc)) + vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* @@ -7542,7 +7462,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7556,7 +7476,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -7700,7 +7620,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .write_l1_tsc_offset = vmx_write_l1_tsc_offset, + .get_l2_tsc_offset = vmx_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, + .write_tsc_offset = vmx_write_tsc_offset, + .write_tsc_multiplier = vmx_write_tsc_multiplier, .load_mmu_pgd = vmx_load_mmu_pgd, @@ -7731,8 +7654,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .setup_mce = vmx_setup_mce, .smi_allowed = vmx_smi_allowed, - .pre_enter_smm = vmx_pre_enter_smm, - .pre_leave_smm = vmx_pre_leave_smm, + .enter_smm = vmx_enter_smm, + .leave_smm = vmx_leave_smm, .enable_smi_window = vmx_enable_smi_window, .can_emulate_instruction = vmx_can_emulate_instruction, @@ -7807,6 +7730,12 @@ static __init int hardware_setup(void) !cpu_has_vmx_invept_global()) enable_ept = 0; + /* NX support is required for shadow paging. */ + if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; @@ -7996,6 +7925,8 @@ static void vmx_exit(void) } #endif vmx_cleanup_l1d_flush(); + + allow_smaller_maxphyaddr = false; } module_exit(vmx_exit); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 16e4e457ba23..3979a947933a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -322,8 +322,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - u64 current_tsc_ratio; - unsigned long host_debugctlmsr; /* @@ -336,10 +334,6 @@ struct vcpu_vmx { /* SGX Launch Control public key hash */ u64 msr_ia32_sgxlepubkeyhash[4]; -#if IS_ENABLED(CONFIG_HYPERV) - u64 hv_root_ept; -#endif - struct pt_desc pt_desc; struct lbr_desc lbr_desc; @@ -357,11 +351,6 @@ struct kvm_vmx { unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; - -#if IS_ENABLED(CONFIG_HYPERV) - hpa_t hv_root_ept; - spinlock_t hv_root_ept_lock; -#endif }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -387,6 +376,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); @@ -404,6 +394,9 @@ void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); + static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value) { @@ -529,12 +522,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) GFP_KERNEL_ACCOUNT); } -static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); -} - static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { return vmx->secondary_exec_control & diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e0f4a46649d7..17468d983fbd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -58,6 +58,7 @@ #include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> #include <linux/entry-kvm.h> +#include <linux/suspend.h> #include <trace/events/kvm.h> @@ -102,6 +103,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -113,6 +116,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); + struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -209,55 +215,78 @@ EXPORT_SYMBOL_GPL(host_efer); bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); +bool __read_mostly enable_apicv = true; +EXPORT_SYMBOL_GPL(enable_apicv); + u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("pf_fixed", pf_fixed), - VCPU_STAT("pf_guest", pf_guest), - VCPU_STAT("tlb_flush", tlb_flush), - VCPU_STAT("invlpg", invlpg), - VCPU_STAT("exits", exits), - VCPU_STAT("io_exits", io_exits), - VCPU_STAT("mmio_exits", mmio_exits), - VCPU_STAT("signal_exits", signal_exits), - VCPU_STAT("irq_window", irq_window_exits), - VCPU_STAT("nmi_window", nmi_window_exits), - VCPU_STAT("halt_exits", halt_exits), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("hypercalls", hypercalls), - VCPU_STAT("request_irq", request_irq_exits), - VCPU_STAT("irq_exits", irq_exits), - VCPU_STAT("host_state_reload", host_state_reload), - VCPU_STAT("fpu_reload", fpu_reload), - VCPU_STAT("insn_emulation", insn_emulation), - VCPU_STAT("insn_emulation_fail", insn_emulation_fail), - VCPU_STAT("irq_injections", irq_injections), - VCPU_STAT("nmi_injections", nmi_injections), - VCPU_STAT("req_event", req_event), - VCPU_STAT("l1d_flush", l1d_flush), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), - VCPU_STAT("nested_run", nested_run), - VCPU_STAT("directed_yield_attempted", directed_yield_attempted), - VCPU_STAT("directed_yield_successful", directed_yield_successful), - VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), - VM_STAT("mmu_pte_write", mmu_pte_write), - VM_STAT("mmu_pde_zapped", mmu_pde_zapped), - VM_STAT("mmu_flooded", mmu_flooded), - VM_STAT("mmu_recycled", mmu_recycled), - VM_STAT("mmu_cache_miss", mmu_cache_miss), - VM_STAT("mmu_unsync", mmu_unsync), - VM_STAT("remote_tlb_flush", remote_tlb_flush), - VM_STAT("largepages", lpages, .mode = 0444), - VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), - VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), - { NULL } +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, mmu_shadow_zapped), + STATS_DESC_COUNTER(VM, mmu_pte_write), + STATS_DESC_COUNTER(VM, mmu_pde_zapped), + STATS_DESC_COUNTER(VM, mmu_flooded), + STATS_DESC_COUNTER(VM, mmu_recycled), + STATS_DESC_COUNTER(VM, mmu_cache_miss), + STATS_DESC_ICOUNTER(VM, mmu_unsync), + STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, pf_fixed), + STATS_DESC_COUNTER(VCPU, pf_guest), + STATS_DESC_COUNTER(VCPU, tlb_flush), + STATS_DESC_COUNTER(VCPU, invlpg), + STATS_DESC_COUNTER(VCPU, exits), + STATS_DESC_COUNTER(VCPU, io_exits), + STATS_DESC_COUNTER(VCPU, mmio_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), + STATS_DESC_COUNTER(VCPU, irq_window_exits), + STATS_DESC_COUNTER(VCPU, nmi_window_exits), + STATS_DESC_COUNTER(VCPU, l1d_flush), + STATS_DESC_COUNTER(VCPU, halt_exits), + STATS_DESC_COUNTER(VCPU, request_irq_exits), + STATS_DESC_COUNTER(VCPU, irq_exits), + STATS_DESC_COUNTER(VCPU, host_state_reload), + STATS_DESC_COUNTER(VCPU, fpu_reload), + STATS_DESC_COUNTER(VCPU, insn_emulation), + STATS_DESC_COUNTER(VCPU, insn_emulation_fail), + STATS_DESC_COUNTER(VCPU, hypercalls), + STATS_DESC_COUNTER(VCPU, irq_injections), + STATS_DESC_COUNTER(VCPU, nmi_injections), + STATS_DESC_COUNTER(VCPU, req_event), + STATS_DESC_COUNTER(VCPU, nested_run), + STATS_DESC_COUNTER(VCPU, directed_yield_attempted), + STATS_DESC_COUNTER(VCPU, directed_yield_successful), + STATS_DESC_ICOUNTER(VCPU, guest_mode) +}; +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == + sizeof(struct kvm_vcpu_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), }; u64 __read_mostly host_xcr0; @@ -778,13 +807,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); -static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, - void *data, int offset, int len, u32 access) -{ - return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, - data, offset, len, access); -} - static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); @@ -819,6 +841,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + vcpu->arch.pdptrs_from_userspace = false; out: @@ -826,40 +849,14 @@ out: } EXPORT_SYMBOL_GPL(load_pdptrs); -bool pdptrs_changed(struct kvm_vcpu *vcpu) -{ - u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - int offset; - gfn_t gfn; - int r; - - if (!is_pae_paging(vcpu)) - return false; - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - return true; - - gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); - r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), - PFERR_USER_MASK | PFERR_WRITE_MASK); - if (r < 0) - return true; - - return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; -} -EXPORT_SYMBOL_GPL(pdptrs_changed); - void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; - if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); } - if ((cr0 ^ old_cr0) & update_bits) + if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); if (((cr0 ^ old_cr0) & X86_CR0_CD) && @@ -1038,10 +1035,7 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { - unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - - if (((cr4 ^ old_cr4) & mmu_role_bits) || + if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) || (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); } @@ -1084,25 +1078,46 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } EXPORT_SYMBOL_GPL(kvm_set_cr4); +static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + unsigned long roots_to_free = 0; + int i; + + /* + * If neither the current CR3 nor any of the prev_roots use the given + * PCID, then nothing needs to be done here because a resync will + * happen anyway before switching to any other CR3. + */ + if (kvm_get_active_pcid(vcpu) == pcid) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); +} + int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { bool skip_tlb_flush = false; + unsigned long pcid = 0; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); if (pcid_enabled) { skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; cr3 &= ~X86_CR3_PCID_NOFLUSH; + pcid = cr3 & X86_CR3_PCID_MASK; } #endif - if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - if (!skip_tlb_flush) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - return 0; - } + /* PDPTRs are always reloaded for PAE paging. */ + if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) + goto handle_tlb_flush; /* * Do not condition the GPA check on long mode, this helper is used to @@ -1115,10 +1130,23 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); + if (cr3 != kvm_read_cr3(vcpu)) + kvm_mmu_new_pgd(vcpu, cr3); + vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); +handle_tlb_flush: + /* + * A load of CR3 that flushes the TLB flushes only the current PCID, + * even if PCID is disabled, in which case PCID=0 is flushed. It's a + * moot point in the end because _disabling_ PCID will flush all PCIDs, + * and it's impossible to use a non-zero PCID when PCID is disabled, + * i.e. only PCID=0 can be relevant. + */ + if (!skip_tlb_flush) + kvm_invalidate_pcid(vcpu, pcid); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -2179,13 +2207,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); + static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { u64 ratio; /* Guest TSC same frequency as host TSC? */ if (!scale) { - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return 0; } @@ -2211,7 +2241,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return -1; } - vcpu->arch.tsc_scaling_ratio = ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, ratio); return 0; } @@ -2223,7 +2253,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return -1; } @@ -2305,10 +2335,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc) return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) { u64 _tsc = tsc; - u64 ratio = vcpu->arch.tsc_scaling_ratio; if (ratio != kvm_default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); @@ -2317,25 +2346,86 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) } EXPORT_SYMBOL_GPL(kvm_scale_tsc); -static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc()); + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + + kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { - vcpu->arch.l1_tsc_offset = offset; - vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset); + u64 nested_offset; + + if (l2_multiplier == kvm_default_tsc_scaling_ratio) + nested_offset = l1_offset; + else + nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + nested_offset += l2_offset; + return nested_offset; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); + +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) +{ + if (l2_multiplier != kvm_default_tsc_scaling_ratio) + return mul_u64_u64_shr(l1_multiplier, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + return l1_multiplier; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); + +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) +{ + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vcpu->arch.l1_tsc_offset, + l1_offset); + + vcpu->arch.l1_tsc_offset = l1_offset; + + /* + * If we are here because L1 chose not to trap WRMSR to TSC then + * according to the spec this should set L1's TSC (as opposed to + * setting L1's offset for L2). + */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + l1_offset, + static_call(kvm_x86_get_l2_tsc_offset)(vcpu), + static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + else + vcpu->arch.tsc_offset = l1_offset; + + static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); +} + +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) +{ + vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; + + /* Userspace is changing the multiplier while L2 is active */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + l1_multiplier, + static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + else + vcpu->arch.tsc_scaling_ratio = l1_multiplier; + + if (kvm_has_tsc_control) + static_call(kvm_x86_write_tsc_multiplier)( + vcpu, vcpu->arch.tsc_scaling_ratio); } static inline bool kvm_check_tsc_unstable(void) @@ -2361,7 +2451,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -2400,7 +2490,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -2459,9 +2549,10 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, + vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -2844,7 +2935,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, + v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, @@ -3250,7 +3342,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, data); } else { - u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } @@ -3552,10 +3644,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * return L1's TSC value to ensure backwards-compatible * behavior for migration. */ - u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : - vcpu->arch.tsc_offset; + u64 offset, ratio; - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; + if (msr_info->host_initiated) { + offset = vcpu->arch.l1_tsc_offset; + ratio = vcpu->arch.l1_tsc_scaling_ratio; + } else { + offset = vcpu->arch.tsc_offset; + ratio = vcpu->arch.tsc_scaling_ratio; + } + + msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; break; } case MSR_MTRRcap: @@ -3879,6 +3978,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: case KVM_CAP_HYPERV_CPUID: + case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: @@ -3909,8 +4009,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SGX_ATTRIBUTE: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + case KVM_CAP_SREGS2: + case KVM_CAP_EXIT_ON_EMULATION_FAILURE: r = 1; break; + case KVM_CAP_EXIT_HYPERCALL: + r = KVM_EXIT_HYPERCALL_VALID_MASK; + break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; #ifdef CONFIG_KVM_XEN @@ -4138,7 +4243,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mark_tsc_unstable("KVM discovered backwards TSC"); if (kvm_check_tsc_unstable()) { - u64 offset = kvm_compute_tsc_offset(vcpu, + u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; @@ -4457,7 +4562,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_smm_changed(struct kvm_vcpu *vcpu); +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) @@ -4517,13 +4622,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { - if (events->smi.smm) - vcpu->arch.hflags |= HF_SMM_MASK; - else - vcpu->arch.hflags &= ~HF_SMM_MASK; - kvm_smm_changed(vcpu); - } + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) + kvm_smm_changed(vcpu, events->smi.smm); vcpu->arch.smi_pending = events->smi.pending; @@ -4807,6 +4907,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); + case KVM_CAP_HYPERV_ENFORCE_CPUID: + return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; if (vcpu->arch.pv_cpuid.enforce) @@ -4825,6 +4928,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; union { + struct kvm_sregs2 *sregs2; struct kvm_lapic_state *lapic; struct kvm_xsave *xsave; struct kvm_xcrs *xcrs; @@ -5197,6 +5301,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } #endif + case KVM_GET_SREGS2: { + u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); + r = -ENOMEM; + if (!u.sregs2) + goto out; + __get_sregs2(vcpu, u.sregs2); + r = -EFAULT; + if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2))) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS2: { + u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); + if (IS_ERR(u.sregs2)) { + r = PTR_ERR(u.sregs2); + u.sregs2 = NULL; + goto out; + } + r = __set_sregs2(vcpu, u.sregs2); + break; + } default: r = -EINVAL; } @@ -5516,6 +5642,21 @@ split_irqchip_unlock: if (kvm_x86_ops.vm_copy_enc_context_from) r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); return r; + case KVM_CAP_EXIT_HYPERCALL: + if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { + r = -EINVAL; + break; + } + kvm->arch.hypercall_exit_enabled = cap->args[0]; + r = 0; + break; + case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + r = -EINVAL; + if (cap->args[0] & ~1) + break; + kvm->arch.exit_on_emulation_error = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -5630,6 +5771,41 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) return 0; } +#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER +static int kvm_arch_suspend_notifier(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i, ret = 0; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!vcpu->arch.pv_time_enabled) + continue; + + ret = kvm_set_guest_paused(vcpu); + if (ret) { + kvm_err("Failed to pause guest VCPU%d: %d\n", + vcpu->vcpu_id, ret); + break; + } + } + mutex_unlock(&kvm->lock); + + return ret ? NOTIFY_BAD : NOTIFY_DONE; +} + +int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + return kvm_arch_suspend_notifier(kvm); + } + + return NOTIFY_DONE; +} +#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -7104,23 +7280,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) return emul_to_vcpu(ctxt)->arch.hflags; } -static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - vcpu->arch.hflags = emul_flags; - kvm_mmu_reset_context(vcpu); + kvm_smm_changed(vcpu, false); } -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate); + return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); } -static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) +static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { - kvm_smm_changed(emul_to_vcpu(ctxt)); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); } static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) @@ -7169,9 +7344,9 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_fxsr = emulator_guest_has_fxsr, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, - .set_hflags = emulator_set_hflags, - .pre_leave_smm = emulator_pre_leave_smm, - .post_leave_smm = emulator_post_leave_smm, + .exiting_smm = emulator_exiting_smm, + .leave_smm = emulator_leave_smm, + .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, }; @@ -7277,8 +7452,33 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + u32 insn_size = ctxt->fetch.end - ctxt->fetch.data; + struct kvm_run *run = vcpu->run; + + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; + run->emulation_failure.ndata = 0; + run->emulation_failure.flags = 0; + + if (insn_size) { + run->emulation_failure.ndata = 3; + run->emulation_failure.flags |= + KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; + run->emulation_failure.insn_size = insn_size; + memset(run->emulation_failure.insn_bytes, 0x90, + sizeof(run->emulation_failure.insn_bytes)); + memcpy(run->emulation_failure.insn_bytes, + ctxt->fetch.data, insn_size); + } +} + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { + struct kvm *kvm = vcpu->kvm; + ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); @@ -7287,10 +7487,9 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } - if (emulation_type & EMULTYPE_SKIP) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + if (kvm->arch.exit_on_emulation_error || + (emulation_type & EMULTYPE_SKIP)) { + prepare_emulation_failure_exit(vcpu); return 0; } @@ -7432,11 +7631,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -static void kvm_smm_changed(struct kvm_vcpu *vcpu) +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { - if (!(vcpu->arch.hflags & HF_SMM_MASK)) { - /* This is a good place to trace that we are exiting SMM. */ - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); + trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); + + if (entering_smm) { + vcpu->arch.hflags |= HF_SMM_MASK; + } else { + vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); /* Process a latched INIT or SMI, if any. */ kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -8361,16 +8563,15 @@ bool kvm_apicv_activated(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_apicv_activated); -void kvm_apicv_init(struct kvm *kvm, bool enable) +static void kvm_apicv_init(struct kvm *kvm) { - if (enable) + if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); else set_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); } -EXPORT_SYMBOL_GPL(kvm_apicv_init); static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { @@ -8406,6 +8607,17 @@ no_yield: return; } +static int complete_hypercall_exit(struct kvm_vcpu *vcpu) +{ + u64 ret = vcpu->run->hypercall.ret; + + if (!is_64_bit_mode(vcpu)) + ret = (u32)ret; + kvm_rax_write(vcpu, ret); + ++vcpu->stat.hypercalls; + return kvm_skip_emulated_instruction(vcpu); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -8471,6 +8683,28 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_sched_yield(vcpu, a0); ret = 0; break; + case KVM_HC_MAP_GPA_RANGE: { + u64 gpa = a0, npages = a1, attrs = a2; + + ret = -KVM_ENOSYS; + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + break; + + if (!PAGE_ALIGNED(gpa) || !npages || + gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) { + ret = -KVM_EINVAL; + break; + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = attrs; + vcpu->run->hypercall.longmode = op_64_bit; + vcpu->arch.complete_userspace_io = complete_hypercall_exit; + return 0; + } default: ret = -KVM_ENOSYS; break; @@ -8554,9 +8788,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) int kvm_check_nested_events(struct kvm_vcpu *vcpu) { - if (WARN_ON_ONCE(!is_guest_mode(vcpu))) - return -EIO; - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { kvm_x86_ops.nested_ops->triple_fault(vcpu); return 1; @@ -8572,7 +8803,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) static_call(kvm_x86_queue_exception)(vcpu); } -static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; bool can_inject = true; @@ -8619,7 +8850,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (is_guest_mode(vcpu)) { r = kvm_check_nested_events(vcpu); if (r < 0) - goto busy; + goto out; } /* try to inject new event if pending */ @@ -8661,7 +8892,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (vcpu->arch.smi_pending) { r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; if (r < 0) - goto busy; + goto out; if (r) { vcpu->arch.smi_pending = false; ++vcpu->arch.smi_count; @@ -8674,7 +8905,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (vcpu->arch.nmi_pending) { r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; if (r < 0) - goto busy; + goto out; if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; @@ -8689,7 +8920,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (kvm_cpu_has_injectable_intr(vcpu)) { r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; if (r < 0) - goto busy; + goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); static_call(kvm_x86_set_irq)(vcpu); @@ -8705,11 +8936,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit *req_immediate_exit = true; WARN_ON(vcpu->arch.exception.pending); - return; + return 0; -busy: - *req_immediate_exit = true; - return; +out: + if (r == -EBUSY) { + *req_immediate_exit = true; + r = 0; + } + return r; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -8888,10 +9122,9 @@ static void enter_smm(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ds; struct desc_ptr dt; + unsigned long cr0; char buf[512]; - u32 cr0; - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); memset(buf, 0, 512); #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) @@ -8901,13 +9134,13 @@ static void enter_smm(struct kvm_vcpu *vcpu) enter_smm_save_state_32(vcpu, buf); /* - * Give pre_enter_smm() a chance to make ISA-specific changes to the - * vCPU state (e.g. leave guest mode) after we've saved the state into - * the SMM state-save area. + * Give enter_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. leave guest mode) after we've saved the state into the + * SMM state-save area. */ - static_call(kvm_x86_pre_enter_smm)(vcpu, buf); + static_call(kvm_x86_enter_smm)(vcpu, buf); - vcpu->arch.hflags |= HF_SMM_MASK; + kvm_smm_changed(vcpu, true); kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); if (static_call(kvm_x86_get_nmi_mask)(vcpu)) @@ -8996,6 +9229,15 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); + + /* + * When APICv gets disabled, we may still have injected interrupts + * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was + * still active when the interrupt got accepted. Make sure + * inject_pending_event() is called to check for that. + */ + if (!vcpu->arch.apicv_active) + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); @@ -9171,7 +9413,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); - if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { @@ -9264,13 +9506,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || kvm_xen_has_interrupt(vcpu)) { ++vcpu->stat.req_event; - kvm_apic_accept_events(vcpu); + r = kvm_apic_accept_events(vcpu); + if (r < 0) { + r = 0; + goto out; + } if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; goto out; } - inject_pending_event(vcpu, &req_immediate_exit); + r = inject_pending_event(vcpu, &req_immediate_exit); + if (r < 0) { + r = 0; + goto out; + } if (req_int_win) static_call(kvm_x86_enable_irq_window)(vcpu); @@ -9472,7 +9722,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) return 1; } - kvm_apic_accept_events(vcpu); + if (kvm_apic_accept_events(vcpu) < 0) + return 0; switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: @@ -9696,7 +9947,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } kvm_vcpu_block(vcpu); - kvm_apic_accept_events(vcpu); + if (kvm_apic_accept_events(vcpu) < 0) { + r = 0; + goto out; + } kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; if (signal_pending(current)) { @@ -9845,7 +10099,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) } EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); -static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; @@ -9878,14 +10132,36 @@ skip_protected_regs: sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); +} + +static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + __get_sregs_common(vcpu, sregs); - memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap)); + if (vcpu->arch.guest_state_protected) + return; if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); } +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) +{ + int i; + + __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2); + + if (vcpu->arch.guest_state_protected) + return; + + if (is_pae_paging(vcpu)) { + for (i = 0 ; i < 4 ; i++) + sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); + sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; + } +} + int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -9898,11 +10174,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int r; + vcpu_load(vcpu); if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); - kvm_apic_accept_events(vcpu); + r = kvm_apic_accept_events(vcpu); + if (r < 0) + goto out; + r = 0; + if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && vcpu->arch.pv.pv_unhalted) @@ -9910,10 +10192,11 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; +out: if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); - return 0; + return r; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, @@ -9997,24 +10280,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvm_is_valid_cr4(vcpu, sregs->cr4); } -static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, + int *mmu_reset_needed, bool update_pdptrs) { struct msr_data apic_base_msr; - int mmu_reset_needed = 0; - int pending_vec, max_bits, idx; + int idx; struct desc_ptr dt; - int ret = -EINVAL; if (!kvm_is_valid_sregs(vcpu, sregs)) - goto out; + return -EINVAL; apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) - goto out; + return -EINVAL; if (vcpu->arch.guest_state_protected) - goto skip_protected_regs; + return 0; dt.size = sregs->idt.limit; dt.address = sregs->idt.base; @@ -10024,31 +10306,30 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) static_call(kvm_x86_set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; + *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; static_call(kvm_x86_set_efer)(vcpu, sregs->efer); - mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (is_pae_paging(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); - mmu_reset_needed = 1; + if (update_pdptrs) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (is_pae_paging(vcpu)) { + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + *mmu_reset_needed = 1; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); } - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (mmu_reset_needed) - kvm_mmu_reset_context(vcpu); kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -10068,20 +10349,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -skip_protected_regs: + return 0; +} + +static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + int pending_vec, max_bits; + int mmu_reset_needed = 0; + int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true); + + if (ret) + return ret; + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); + kvm_make_request(KVM_REQ_EVENT, vcpu); } + return 0; +} - kvm_make_request(KVM_REQ_EVENT, vcpu); +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) +{ + int mmu_reset_needed = 0; + bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; + bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) && + !(sregs2->efer & EFER_LMA); + int i, ret; - ret = 0; -out: - return ret; + if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID) + return -EINVAL; + + if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected)) + return -EINVAL; + + ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2, + &mmu_reset_needed, !valid_pdptrs); + if (ret) + return ret; + + if (valid_pdptrs) { + for (i = 0; i < 4 ; i++) + kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]); + + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + mmu_reset_needed = 1; + vcpu->arch.pdptrs_from_userspace = true; + } + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, @@ -10305,13 +10629,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) struct page *page; int r; + vcpu->arch.last_vmentry_cpu = -1; + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - kvm_set_tsc_khz(vcpu, max_tsc_khz); - r = kvm_mmu_create(vcpu); if (r < 0) return r; @@ -10371,6 +10695,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pending_external_vector = -1; vcpu->arch.preempted_in_kernel = false; +#if IS_ENABLED(CONFIG_HYPERV) + vcpu->arch.hv_root_tdp = INVALID_PAGE; +#endif + r = static_call(kvm_x86_vcpu_create)(vcpu); if (r) goto free_guest_fpu; @@ -10379,8 +10707,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); + kvm_set_tsc_khz(vcpu, max_tsc_khz); kvm_vcpu_reset(vcpu, false); - kvm_init_mmu(vcpu, false); + kvm_init_mmu(vcpu); vcpu_put(vcpu); return 0; @@ -10454,6 +10783,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + unsigned long old_cr0 = kvm_read_cr0(vcpu); + kvm_lapic_reset(vcpu, init_event); vcpu->arch.hflags = 0; @@ -10522,6 +10853,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + + /* + * Reset the MMU context if paging was enabled prior to INIT (which is + * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the + * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be + * checked because it is unconditionally cleared on INIT and all other + * paging related bits are ignored if paging is disabled, i.e. CR0.WP, + * CR4, and EFER changes are all irrelevant if CR0.PG was '0'. + */ + if (old_cr0 & X86_CR0_PG) + kvm_mmu_reset_context(vcpu); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -10639,6 +10981,9 @@ int kvm_arch_hardware_setup(void *opaque) int r; rdmsrl_safe(MSR_EFER, &host_efer); + if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) && + !(host_efer & EFER_NX))) + return -EIO; if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); @@ -10754,9 +11099,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.guest_can_read_msr_platform_info = true; +#if IS_ENABLED(CONFIG_HYPERV) + spin_lock_init(&kvm->arch.hv_root_tdp_lock); + kvm->arch.hv_root_tdp = INVALID_PAGE; +#endif + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); @@ -10917,17 +11268,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +static void memslot_rmap_free(struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; + } +} - if (i == 0) - continue; +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + int i; + memslot_rmap_free(slot); + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -10935,11 +11292,79 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) kvm_page_track_free_memslot(slot); } -static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, - unsigned long npages) +static int memslot_rmap_alloc(struct kvm_memory_slot *slot, + unsigned long npages) { + const int sz = sizeof(*slot->arch.rmap[0]); int i; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + int level = i + 1; + int lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + WARN_ON(slot->arch.rmap[i]); + + slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); + if (!slot->arch.rmap[i]) { + memslot_rmap_free(slot); + return -ENOMEM; + } + } + + return 0; +} + +int alloc_all_memslots_rmaps(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r, i; + + /* + * Check if memslots alreday have rmaps early before acquiring + * the slots_arch_lock below. + */ + if (kvm_memslots_have_rmaps(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + /* + * Read memslots_have_rmaps again, under the slots arch lock, + * before allocating the rmaps + */ + if (kvm_memslots_have_rmaps(kvm)) { + mutex_unlock(&kvm->slots_arch_lock); + return 0; + } + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + r = memslot_rmap_alloc(slot, slot->npages); + if (r) { + mutex_unlock(&kvm->slots_arch_lock); + return r; + } + } + } + + /* + * Ensure that memslots_have_rmaps becomes true strictly after + * all the rmap pointers are set. + */ + smp_store_release(&kvm->arch.memslots_have_rmaps, true); + mutex_unlock(&kvm->slots_arch_lock); + return 0; +} + +static int kvm_alloc_memslot_metadata(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages) +{ + int i, r; + /* * Clear out the previous array pointers for the KVM_MR_MOVE case. The * old arrays will be freed by __kvm_set_memory_region() if installing @@ -10947,7 +11372,13 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, */ memset(&slot->arch, 0, sizeof(slot->arch)); - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + if (kvm_memslots_have_rmaps(kvm)) { + r = memslot_rmap_alloc(slot, npages); + if (r) + return r; + } + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; @@ -10956,14 +11387,6 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; - slot->arch.rmap[i] = - kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.rmap[i]) - goto out_free; - if (i == 0) - continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; @@ -10993,12 +11416,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); - slot->arch.rmap[i] = NULL; - if (i == 0) - continue; + memslot_rmap_free(slot); + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -11027,7 +11447,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(memslot, + return kvm_alloc_memslot_metadata(kvm, memslot, mem->memory_size >> PAGE_SHIFT); return 0; } @@ -11103,36 +11523,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, */ kvm_mmu_zap_collapsible_sptes(kvm, new); } else { - /* By default, write-protect everything to log writes. */ - int level = PG_LEVEL_4K; + /* + * Initially-all-set does not require write protecting any page, + * because they're all assumed to be dirty. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; if (kvm_x86_ops.cpu_dirty_log_size) { - /* - * Clear all dirty bits, unless pages are treated as - * dirty from the get-go. - */ - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) - kvm_mmu_slot_leaf_clear_dirty(kvm, new); - - /* - * Write-protect large pages on write so that dirty - * logging happens at 4k granularity. No need to - * write-protect small SPTEs since write accesses are - * logged by the CPU via dirty bits. - */ - level = PG_LEVEL_2M; - } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { - /* - * If we're with initial-all-set, we don't need - * to write protect any small page because - * they're reported as dirty already. However - * we still need to write-protect huge pages - * so that the page split can happen lazily on - * the first write to the huge page. - */ - level = PG_LEVEL_2M; + kvm_mmu_slot_leaf_clear_dirty(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); + } else { + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); } - kvm_mmu_slot_remove_write_access(kvm, new, level); } } @@ -11701,8 +12104,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) { bool pcid_enabled; struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; @@ -11736,23 +12137,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); - /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. - */ - + kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); case INVPCID_TYPE_ALL_NON_GLOBAL: @@ -11765,7 +12150,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return kvm_skip_emulated_instruction(vcpu); default: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 521f74e5bbf2..44ae10312740 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -157,16 +157,6 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) return cs_l; } -static inline bool is_la57_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return (vcpu->arch.efer & EFER_LMA) && - kvm_read_cr4_bits(vcpu, X86_CR4_LA57); -#else - return 0; -#endif -} - static inline bool x86_exception_has_error_code(unsigned int vector) { static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) | diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index a67afd74232c..a1d24fdc07cf 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1417,7 +1417,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } -static unsigned long insn_get_effective_ip(struct pt_regs *regs) +static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip) { unsigned long seg_base = 0; @@ -1430,10 +1430,12 @@ static unsigned long insn_get_effective_ip(struct pt_regs *regs) if (!user_64bit_mode(regs)) { seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); if (seg_base == -1L) - return 0; + return -EINVAL; } - return seg_base + regs->ip; + *ip = seg_base + regs->ip; + + return 0; } /** @@ -1446,18 +1448,17 @@ static unsigned long insn_get_effective_ip(struct pt_regs *regs) * * Returns: * - * Number of instruction bytes copied. - * - * 0 if nothing was copied. + * - number of instruction bytes copied. + * - 0 if nothing was copied. + * - -EINVAL if the linear address of the instruction could not be calculated */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { unsigned long ip; int not_copied; - ip = insn_get_effective_ip(regs); - if (!ip) - return 0; + if (insn_get_effective_ip(regs, &ip)) + return -EINVAL; not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); @@ -1475,18 +1476,17 @@ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) * * Returns: * - * Number of instruction bytes copied. - * - * 0 if nothing was copied. + * - number of instruction bytes copied. + * - 0 if nothing was copied. + * - -EINVAL if the linear address of the instruction could not be calculated. */ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { unsigned long ip; int not_copied; - ip = insn_get_effective_ip(regs); - if (!ip) - return 0; + if (insn_get_effective_ip(regs, &ip)) + return -EINVAL; not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 4d32cb06ffd5..ec9922cba30a 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -58,12 +58,16 @@ SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) 2: .skip 5-(2b-1b), 0x90 SYM_FUNC_END(__x86_indirect_alt_call_\reg) +STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg) + SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) ANNOTATE_RETPOLINE_SAFE 1: jmp *%\reg 2: .skip 5-(2b-1b), 0x90 SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) +STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) + .endm /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6bda7f67d737..2d27932c9ac7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1186,7 +1186,7 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, return; /* kprobes don't want to hook the spurious faults: */ - if (kprobe_page_fault(regs, X86_TRAP_PF)) + if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* @@ -1239,7 +1239,7 @@ void do_user_addr_fault(struct pt_regs *regs, } /* kprobes don't want to hook the spurious faults: */ - if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) + if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 21ffb03f6c72..74b78840182d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -651,7 +651,7 @@ void __init find_low_pfn_range(void) highmem_pfn_init(); } -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM @@ -677,7 +677,7 @@ void __init initmem_init(void) setup_bootmem_allocator(); } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ void __init setup_bootmem_allocator(void) { diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index a2332eef66e9..4a67b922bce1 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -192,6 +192,10 @@ static const struct file_operations fops_init_pkru = { static int __init create_init_pkru_value(void) { + /* Do not expose the file if pkeys are not supported. */ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return 0; + debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR, arch_debugfs_dir, NULL, &fops_init_pkru); return 0; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 78804680e923..cfe6b1e85fa6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,6 +14,7 @@ #include <asm/nospec-branch.h> #include <asm/cache.h> #include <asm/apic.h> +#include <asm/perf_event.h> #include "mm_internal.h" @@ -404,9 +405,14 @@ static inline void cr4_update_pce_mm(struct mm_struct *mm) { if (static_branch_unlikely(&rdpmc_always_available_key) || (!static_branch_unlikely(&rdpmc_never_available_key) && - atomic_read(&mm->context.perf_rdpmc_allowed))) + atomic_read(&mm->context.perf_rdpmc_allowed))) { + /* + * Clear the existing dirty counters to + * prevent the leak for an RDPMC task. + */ + perf_clear_dirty_counters(); cr4_set_bits_irqsoff(X86_CR4_PCE); - else + } else cr4_clear_bits_irqsoff(X86_CR4_PCE); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2a2e290fa5d8..e835164189f1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -31,7 +31,7 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) } #define EMIT(bytes, len) \ - do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) + do { prog = emit_code(prog, bytes, len); } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -239,7 +239,6 @@ struct jit_context { static void push_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[0]) EMIT1(0x53); /* push rbx */ @@ -255,7 +254,6 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used) static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[3]) EMIT2(0x41, 0x5F); /* pop r15 */ @@ -277,13 +275,12 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; - int cnt = X86_PATCH_SIZE; /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - memcpy(prog, x86_nops[5], cnt); - prog += cnt; + memcpy(prog, x86_nops[5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) EMIT2(0x31, 0xC0); /* xor eax, eax */ @@ -303,7 +300,6 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + X86_PATCH_SIZE); @@ -423,7 +419,6 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, int off1 = 42; int off2 = 31; int off3 = 9; - int cnt = 0; /* count the additional bytes used for popping callee regs from stack * that need to be taken into account for each of the offsets that @@ -513,7 +508,6 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, int pop_bytes = 0; int off1 = 20; int poke_off; - int cnt = 0; /* count the additional bytes used for popping callee regs to stack * that need to be taken into account for jump offset that is used for @@ -615,7 +609,6 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, { u8 *prog = *pprog; u8 b1, b2, b3; - int cnt = 0; /* * Optimization: if imm32 is positive, use 'mov %eax, imm32' @@ -655,7 +648,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { u8 *prog = *pprog; - int cnt = 0; if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { /* @@ -678,7 +670,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) { u8 *prog = *pprog; - int cnt = 0; if (is64) { /* mov dst, src */ @@ -697,7 +688,6 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) { u8 *prog = *pprog; - int cnt = 0; if (is_imm8(off)) { /* 1-byte signed displacement. @@ -720,7 +710,6 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) { u8 *prog = *pprog; - int cnt = 0; if (is64) EMIT1(add_2mod(0x48, dst_reg, src_reg)); @@ -733,7 +722,6 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -764,7 +752,6 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -799,7 +786,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { u8 *prog = *pprog; - int cnt = 0; EMIT1(0xF0); /* lock prefix */ @@ -869,10 +855,10 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } -static int emit_nops(u8 **pprog, int len) +static void emit_nops(u8 **pprog, int len) { u8 *prog = *pprog; - int i, noplen, cnt = 0; + int i, noplen; while (len > 0) { noplen = len; @@ -886,8 +872,6 @@ static int emit_nops(u8 **pprog, int len) } *pprog = prog; - - return cnt; } #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) @@ -902,7 +886,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0, excnt = 0; + int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; int err; @@ -1297,7 +1281,7 @@ st: if (is_imm8(insn->off)) emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; - u8 *_insn = image + proglen; + u8 *_insn = image + proglen + (start_of_ldx - temp); s64 delta; /* populate jmp_offset for JMP above */ @@ -1576,7 +1560,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1622,7 +1606,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } break; } @@ -1647,7 +1631,7 @@ emit_jmp: nops); return -EFAULT; } - cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + emit_nops(&prog, INSN_SZ_DIFF - 2); } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1754,7 +1738,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, { u8 *prog = *pprog; u8 *jmp_insn; - int cnt = 0; /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1822,7 +1805,6 @@ static void emit_align(u8 **pprog, u32 align) static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + 2 + 4); @@ -1854,7 +1836,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, u8 **branches) { u8 *prog = *pprog; - int i, cnt = 0; + int i; /* The first fmod_ret program will receive a garbage return value. * Set this to 0 to avoid confusing the program. @@ -1950,7 +1932,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *tprogs, void *orig_call) { - int ret, i, cnt = 0, nr_args = m->nr_args; + int ret, i, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; @@ -2095,8 +2077,6 @@ static int emit_fallback_jump(u8 **pprog) */ err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); #else - int cnt = 0; - EMIT2(0xFF, 0xE2); /* jmp rdx */ #endif *pprog = prog; @@ -2106,7 +2086,7 @@ static int emit_fallback_jump(u8 **pprog) static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; - int pivot, err, jg_bytes = 1, cnt = 0; + int pivot, err, jg_bytes = 1; s64 jg_offset; if (a == b) { diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 8a26e705cb06..147c30a81f15 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -468,7 +468,7 @@ void __init efi_init(void) */ if (!efi_runtime_supported()) - pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); + pr_err("No EFI runtime due to 32/64-bit mismatch with kernel\n"); if (!efi_runtime_supported() || efi_runtime_disabled()) { efi_memmap_unmap(); diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index 6b1f3a4eeb44..a0b491ae2de8 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -10,7 +10,6 @@ # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y subdir- := rm diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 2ed81e581755..0575decb5e54 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -7,7 +7,6 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> -#include <asm/unistd.h> #include <asm/syscall.h> #define __NO_STUBS @@ -26,20 +25,17 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) + +#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include <asm/syscalls_32.h> -#undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [ nr ] = sym, +#undef __SYSCALL +#define __SYSCALL(nr, sym) sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 2e8544dafbb0..95725b5a41ac 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -7,7 +7,6 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> -#include <asm/unistd.h> #include <asm/syscall.h> #define __NO_STUBS @@ -36,23 +35,15 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_X32(nr, sym) -#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) - -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include <asm/syscalls_64.h> -#undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [ nr ] = sym, +#undef __SYSCALL +#define __SYSCALL(nr, sym) sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_64.h> }; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index e87699aa2dc8..03149422dce2 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -592,8 +592,10 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_debug) DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap) { /* This should never happen and there is no way to handle it. */ + instrumentation_begin(); pr_err("Unknown trap in Xen PV mode."); BUG(); + instrumentation_end(); } #ifdef CONFIG_X86_MCE |