diff options
Diffstat (limited to 'arch/x86')
469 files changed, 17295 insertions, 16582 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fbf26e0f7a6a..2792879d398e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,10 +14,12 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select GENERIC_VDSO_32 select HAVE_DEBUG_STACKOVERFLOW + select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION - select GENERIC_VDSO_32 + select ARCH_SPLIT_ARG64 config X86_64 def_bool y @@ -30,6 +32,7 @@ config X86_64 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select SWIOTLB + select ARCH_HAS_ELFCORE_COMPAT config FORCE_DYNAMIC_FTRACE def_bool y @@ -91,7 +94,11 @@ config X86 select ARCH_STACKWALK select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_LTO_CLANG if X86_64 + select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -109,7 +116,6 @@ config X86 select DCACHE_WORD_ACCESS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT - select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE @@ -145,6 +151,7 @@ config X86 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if X86_64 + select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT @@ -163,11 +170,14 @@ config X86 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING if X86_64 + select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT + select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -182,6 +192,7 @@ config X86 select HAVE_HW_BREAKPOINT select HAVE_IDE select HAVE_IOREMAP_PROT + select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP @@ -199,8 +210,8 @@ config X86 select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_MOVE_PMD + select HAVE_MOVE_PUD select HAVE_NMI - select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS @@ -214,10 +225,12 @@ config X86 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR select HAVE_STACK_VALIDATION if X86_64 select HAVE_STATIC_CALL select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION + select HAVE_PREEMPT_DYNAMIC select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -330,9 +343,6 @@ config ZONE_DMA32 config AUDIT_ARCH def_bool y if X86_64 -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y - config KASAN_SHADOW_OFFSET hex depends on KASAN @@ -441,7 +451,7 @@ config X86_X2APIC If you don't know what to do here, say N. config X86_MPPARSE - bool "Enable MPS table" if ACPI || SFI + bool "Enable MPS table" if ACPI default y depends on X86_LOCAL_APIC help @@ -600,7 +610,6 @@ config X86_INTEL_MID depends on PCI depends on X86_64 || (PCI_GOANY && X86_32) depends on X86_IO_APIC - select SFI select I2C select DW_APB_TIMER select APB_TIMER @@ -887,19 +896,7 @@ config HPET_TIMER config HPET_EMULATE_RTC def_bool y - depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) - -config APB_TIMER - def_bool y if X86_INTEL_MID - prompt "Intel MID APB Timer Support" if X86_INTEL_MID - select DW_APB_TIMER - depends on X86_INTEL_MID && SFI - help - APB timer is the replacement for 8254, HPET on X86 MID platforms. - The APBT provides a stable time base on SMP - systems, unlike the TSC, but it is more expensive to access, - as it is off-chip. APB timers are always running regardless of CPU - C states, they are used as per CPU clockevent device when possible. + depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) # Mark as expert because too many people got it wrong. # The code disables itself when not needed. @@ -1155,10 +1152,6 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_THERMAL_VECTOR - def_bool y - depends on X86_MCE_INTEL - source "arch/x86/events/Kconfig" config X86_LEGACY_VM86 @@ -1931,6 +1924,23 @@ config X86_INTEL_TSX_MODE_AUTO side channel attacks- equals the tsx=auto command line parameter. endchoice +config X86_SGX + bool "Software Guard eXtensions (SGX)" + depends on X86_64 && CPU_SUP_INTEL + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + select SRCU + select MMU_NOTIFIER + help + Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions + that can be used by applications to set aside private regions of code + and data, referred to as enclaves. An enclave's private memory can + only be accessed by code running within the enclave. Accesses from + outside the enclave, including other enclaves, are disallowed by + hardware. + + If unsure, say N. + config EFI bool "EFI runtime service support" depends on ACPI @@ -2449,8 +2459,6 @@ source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" -source "drivers/sfi/Kconfig" - config X86_APM_BOOT def_bool y depends on APM @@ -2637,7 +2645,7 @@ config PCI_DIRECT config PCI_MMCONFIG bool "Support mmconfig PCI config space access" if X86_64 default y - depends on PCI && (ACPI || SFI || JAILHOUSE_GUEST) + depends on PCI && (ACPI || JAILHOUSE_GUEST) depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG) config PCI_OLPC @@ -2844,7 +2852,6 @@ config IA32_EMULATION depends on X86_64 select ARCH_WANT_OLD_COMPAT_IPC select BINFMT_ELF - select COMPAT_BINFMT_ELF select COMPAT_OLD_SIGACTION help Include code to run legacy 32-bit programs under a diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 27b5e2bc6a01..80b57e7f4947 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -62,9 +62,6 @@ config EARLY_PRINTK_USB_XDBC You should normally say N here, unless you want to debug early crashes or need a very simple printk logging facility. -config COPY_MC_TEST - def_bool n - config EFI_PGT_DUMP bool "Dump the EFI pagetable" depends on EFI diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1bf21746f4ce..9a85eae37b17 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -24,17 +24,10 @@ endif # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. -# -# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For -# older versions of GCC, include an *assembly* header to make sure that -# gcc doesn't play any games behind our back. -CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h -M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS)) - -REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \ +REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse + -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) REALMODE_CFLAGS += -ffreestanding REALMODE_CFLAGS += -fno-stack-protector @@ -57,6 +50,9 @@ export BITS KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow KBUILD_CFLAGS += $(call cc-option,-mno-avx,) +# Intel CET isn't enabled in the kernel +KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -173,6 +169,11 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,) endif +ifdef CONFIG_LTO_CLANG +KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \ + -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) +endif + # Workaround for a gcc prelease that unfortunately was shipped in a suse release KBUILD_CFLAGS += -Wno-sign-compare # @@ -236,9 +237,6 @@ core-y += arch/x86/ drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ drivers-$(CONFIG_PCI) += arch/x86/pci/ -# must be linked after kernel/ -drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ - # suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ @@ -299,16 +297,20 @@ archclean: $(Q)$(MAKE) $(clean)=arch/x86/tools define archhelp - echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' - echo ' install - Install kernel using' - echo ' (your) ~/bin/$(INSTALLKERNEL) or' - echo ' (distribution) /sbin/$(INSTALLKERNEL) or' - echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' - echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' - echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' - echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' - echo ' bzdisk/fdimage*/isoimage also accept:' - echo ' FDARGS="..." arguments for the booted kernel' - echo ' FDINITRD=file initrd for the booted kernel' + echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' + echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or' + echo ' (distribution) /sbin/$(INSTALLKERNEL) or install to ' + echo ' $$(INSTALL_PATH) and run lilo' + echo '' + echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' + echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' + echo ' bzdisk/fdimage*/isoimage also accept:' + echo ' FDARGS="..." arguments for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' + echo '' + echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest' + echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest' + endef diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h deleted file mode 100644 index e19fd7536307..000000000000 --- a/arch/x86/boot/code16gcc.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -# -# code16gcc.h -# -# This file is added to the assembler via -Wa when compiling 16-bit C code. -# This is done this way instead via asm() to make sure gcc does not reorder -# things around us. -# -# gcc 4.9+ has a real -m16 option so we can drop this hack long term. -# - - .code16gcc diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 40b8fd375d52..e0bc3988c3fa 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -35,7 +35,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse -KBUILD_CFLAGS += -ffreestanding +KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 017de6cc87dc..e94874f4bbc1 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -241,12 +241,12 @@ SYM_FUNC_START(startup_32) leal rva(startup_64)(%ebp), %eax #ifdef CONFIG_EFI_MIXED movl rva(efi32_boot_args)(%ebp), %edi - cmp $0, %edi + testl %edi, %edi jz 1f leal rva(efi64_stub_entry)(%ebp), %eax movl rva(efi32_boot_args+4)(%ebp), %esi movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer - cmpl $0, %edx + testl %edx, %edx jnz 1f /* * efi_pe_entry uses MS calling convention, which requires 32 bytes of @@ -592,7 +592,7 @@ SYM_CODE_START(trampoline_32bit_src) movl %eax, %cr0 /* Check what paging mode we want to be in after the trampoline */ - cmpl $0, %edx + testl %edx, %edx jz 1f /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ @@ -622,7 +622,7 @@ SYM_CODE_START(trampoline_32bit_src) /* Enable PAE and LA57 (if required) paging modes */ movl $X86_CR4_PAE, %eax - cmpl $0, %edx + testl %edx, %edx jz 1f orl $X86_CR4_LA57, %eax 1: diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 39b2eded7bc2..f7213d0943b8 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -168,16 +168,6 @@ void initialize_identity_maps(void *rmode) write_cr3(top_level_pgt); } -/* - * This switches the page tables to the new level4 that has been built - * via calls to add_identity_map() above. If booted via startup_32(), - * this is effectively a no-op. - */ -void finalize_identity_maps(void) -{ - write_cr3(top_level_pgt); -} - static pte_t *split_large_pmd(struct x86_mapping_info *info, pmd_t *pmdp, unsigned long __address) { diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index d9a631c5973c..901ea5ebec22 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -12,6 +12,7 @@ #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN +#undef CONFIG_KASAN_GENERIC /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 78210793d357..9c9c4a888b1d 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -50,7 +50,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_UNUSED_SYMBOLS is not set CONFIG_BINFMT_MISC=y CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9936528e1939..b60bd2d86034 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -48,7 +48,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_UNUSED_SYMBOLS is not set CONFIG_BINFMT_MISC=y CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a31de0c6ccde..b28e36b7c96b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -4,8 +4,6 @@ OBJECT_FILES_NON_STANDARD := y -obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o - obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c deleted file mode 100644 index 7b7dc05fa1a4..000000000000 --- a/arch/x86/crypto/aes_glue.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 1852b19a73a0..4e3972570916 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -43,10 +43,6 @@ #ifdef __x86_64__ # constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 -.align 16 -.Lgf128mul_x_ble_mask: - .octa 0x00000000000000010000000000000087 .section .rodata.cst16.POLY, "aM", @progbits, 16 .align 16 POLY: .octa 0xC2000000000000000000000000000001 @@ -146,7 +142,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define CTR %xmm11 #define INC %xmm12 -#define GF128MUL_MASK %xmm10 +#define GF128MUL_MASK %xmm7 #ifdef __x86_64__ #define AREG %rax @@ -318,7 +314,7 @@ _initial_blocks_\@: # Main loop - Encrypt/Decrypt remaining blocks - cmp $0, %r13 + test %r13, %r13 je _zero_cipher_left_\@ sub $64, %r13 je _four_cipher_left_\@ @@ -437,7 +433,7 @@ _multiple_of_16_bytes_\@: mov PBlockLen(%arg2), %r12 - cmp $0, %r12 + test %r12, %r12 je _partial_done\@ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 @@ -474,7 +470,7 @@ _T_8_\@: add $8, %r10 sub $8, %r11 psrldq $8, %xmm0 - cmp $0, %r11 + test %r11, %r11 je _return_T_done_\@ _T_4_\@: movd %xmm0, %eax @@ -482,7 +478,7 @@ _T_4_\@: add $4, %r10 sub $4, %r11 psrldq $4, %xmm0 - cmp $0, %r11 + test %r11, %r11 je _return_T_done_\@ _T_123_\@: movd %xmm0, %eax @@ -619,7 +615,7 @@ _get_AAD_blocks\@: /* read the last <16B of AAD */ _get_AAD_rest\@: - cmp $0, %r11 + test %r11, %r11 je _get_AAD_done\@ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 @@ -640,7 +636,7 @@ _get_AAD_done\@: .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ AAD_HASH operation mov PBlockLen(%arg2), %r13 - cmp $0, %r13 + test %r13, %r13 je _partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN @@ -692,7 +688,7 @@ _no_extra_mask_1_\@: pshufb %xmm2, %xmm3 pxor %xmm3, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block @@ -727,7 +723,7 @@ _no_extra_mask_2_\@: pshufb %xmm2, %xmm9 pxor %xmm9, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block @@ -747,7 +743,7 @@ _encode_done_\@: pshufb %xmm2, %xmm9 .endif # output encrypted Bytes - cmp $0, %r10 + test %r10, %r10 jl _partial_fill_\@ mov %r13, %r12 mov $16, %r13 @@ -2577,13 +2573,140 @@ SYM_FUNC_START(aesni_cbc_dec) ret SYM_FUNC_END(aesni_cbc_dec) -#ifdef __x86_64__ +/* + * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +SYM_FUNC_START(aesni_cts_cbc_enc) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + mov 480(KEYP), KLEN + movups (IVP), STATE + sub $16, LEN + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + movups (T1), %xmm4 + movups (IVP), %xmm5 + + movups (INP), IN1 + add LEN, INP + movups (INP), IN2 + + pxor IN1, STATE + call _aesni_enc1 + + pshufb %xmm5, IN2 + pxor STATE, IN2 + pshufb %xmm4, STATE + add OUTP, LEN + movups STATE, (LEN) + + movaps IN2, STATE + call _aesni_enc1 + movups STATE, (OUTP) + +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + ret +SYM_FUNC_END(aesni_cts_cbc_enc) + +/* + * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +SYM_FUNC_START(aesni_cts_cbc_dec) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + mov 480(KEYP), KLEN + add $240, KEYP + movups (IVP), IV + sub $16, LEN + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + movups (T1), %xmm4 + + movups (INP), STATE + add LEN, INP + movups (INP), IN1 + + call _aesni_dec1 + movaps STATE, IN2 + pshufb %xmm4, STATE + pxor IN1, STATE + + add OUTP, LEN + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + call _aesni_dec1 + + pxor IV, STATE + movups STATE, (OUTP) + +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + ret +SYM_FUNC_END(aesni_cts_cbc_dec) + .pushsection .rodata .align 16 +.Lcts_permute_table: + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +#ifdef __x86_64__ .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +#endif .popsection +#ifdef __x86_64__ /* * _aesni_inc_init: internal ABI * setup registers used by _aesni_inc @@ -2696,6 +2819,14 @@ SYM_FUNC_START(aesni_ctr_enc) ret SYM_FUNC_END(aesni_ctr_enc) +#endif + +.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 +.align 16 +.Lgf128mul_x_ble_mask: + .octa 0x00000000000000010000000000000087 +.previous + /* * _aesni_gf128mul_x_ble: internal ABI * Multiply in GF(2^128) for XTS IVs @@ -2708,120 +2839,325 @@ SYM_FUNC_END(aesni_ctr_enc) * CTR: == temporary value */ #define _aesni_gf128mul_x_ble() \ - pshufd $0x13, IV, CTR; \ + pshufd $0x13, IV, KEY; \ paddq IV, IV; \ - psrad $31, CTR; \ - pand GF128MUL_MASK, CTR; \ - pxor CTR, IV; + psrad $31, KEY; \ + pand GF128MUL_MASK, KEY; \ + pxor KEY, IV; /* - * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, bool enc, le128 *iv) + * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) */ -SYM_FUNC_START(aesni_xts_crypt8) +SYM_FUNC_START(aesni_xts_encrypt) FRAME_BEGIN - cmpb $0, %cl - movl $0, %ecx - movl $240, %r10d - leaq _aesni_enc4, %r11 - leaq _aesni_dec4, %rax - cmovel %r10d, %ecx - cmoveq %rax, %r11 - +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK +#else + movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK +#endif movups (IVP), IV mov 480(KEYP), KLEN - addq %rcx, KEYP + +.Lxts_enc_loop4: + sub $64, LEN + jl .Lxts_enc_1x movdqa IV, STATE1 - movdqu 0x00(INP), INC - pxor INC, STATE1 + movdqu 0x00(INP), IN + pxor IN, STATE1 movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - movdqu 0x10(INP), INC - pxor INC, STATE2 + movdqu 0x10(INP), IN + pxor IN, STATE2 movdqu IV, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - movdqu 0x20(INP), INC - pxor INC, STATE3 + movdqu 0x20(INP), IN + pxor IN, STATE3 movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - movdqu 0x30(INP), INC - pxor INC, STATE4 + movdqu 0x30(INP), IN + pxor IN, STATE4 movdqu IV, 0x30(OUTP) - CALL_NOSPEC r11 + call _aesni_enc4 - movdqu 0x00(OUTP), INC - pxor INC, STATE1 + movdqu 0x00(OUTP), IN + pxor IN, STATE1 movdqu STATE1, 0x00(OUTP) + movdqu 0x10(OUTP), IN + pxor IN, STATE2 + movdqu STATE2, 0x10(OUTP) + + movdqu 0x20(OUTP), IN + pxor IN, STATE3 + movdqu STATE3, 0x20(OUTP) + + movdqu 0x30(OUTP), IN + pxor IN, STATE4 + movdqu STATE4, 0x30(OUTP) + _aesni_gf128mul_x_ble() - movdqa IV, STATE1 - movdqu 0x40(INP), INC - pxor INC, STATE1 - movdqu IV, 0x40(OUTP) - movdqu 0x10(OUTP), INC - pxor INC, STATE2 - movdqu STATE2, 0x10(OUTP) + add $64, INP + add $64, OUTP + test LEN, LEN + jnz .Lxts_enc_loop4 +.Lxts_enc_ret_iv: + movups IV, (IVP) + +.Lxts_enc_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + ret + +.Lxts_enc_1x: + add $64, LEN + jz .Lxts_enc_ret_iv + sub $16, LEN + jl .Lxts_enc_cts4 + +.Lxts_enc_loop1: + movdqu (INP), STATE + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE _aesni_gf128mul_x_ble() - movdqa IV, STATE2 - movdqu 0x50(INP), INC - pxor INC, STATE2 - movdqu IV, 0x50(OUTP) - movdqu 0x20(OUTP), INC - pxor INC, STATE3 - movdqu STATE3, 0x20(OUTP) + test LEN, LEN + jz .Lxts_enc_out + + add $16, INP + sub $16, LEN + jl .Lxts_enc_cts1 + + movdqu STATE, (OUTP) + add $16, OUTP + jmp .Lxts_enc_loop1 + +.Lxts_enc_out: + movdqu STATE, (OUTP) + jmp .Lxts_enc_ret_iv + +.Lxts_enc_cts4: + movdqa STATE4, STATE + sub $16, OUTP + +.Lxts_enc_cts1: +#ifndef __x86_64__ + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + add LEN, INP /* rewind input pointer */ + add $16, LEN /* # bytes in final block */ + movups (INP), IN1 + + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + add OUTP, LEN + + movups (T1), %xmm4 + movaps STATE, IN2 + pshufb %xmm4, STATE + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE + + movups STATE, (OUTP) + jmp .Lxts_enc_ret +SYM_FUNC_END(aesni_xts_encrypt) + +/* + * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_decrypt) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK +#else + movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK +#endif + movups (IVP), IV + + mov 480(KEYP), KLEN + add $240, KEYP + + test $15, LEN + jz .Lxts_dec_loop4 + sub $16, LEN + +.Lxts_dec_loop4: + sub $64, LEN + jl .Lxts_dec_1x + + movdqa IV, STATE1 + movdqu 0x00(INP), IN + pxor IN, STATE1 + movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() - movdqa IV, STATE3 - movdqu 0x60(INP), INC - pxor INC, STATE3 - movdqu IV, 0x60(OUTP) + movdqa IV, STATE2 + movdqu 0x10(INP), IN + pxor IN, STATE2 + movdqu IV, 0x10(OUTP) - movdqu 0x30(OUTP), INC - pxor INC, STATE4 - movdqu STATE4, 0x30(OUTP) + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + movdqu 0x20(INP), IN + pxor IN, STATE3 + movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - movdqu 0x70(INP), INC - pxor INC, STATE4 - movdqu IV, 0x70(OUTP) + movdqu 0x30(INP), IN + pxor IN, STATE4 + movdqu IV, 0x30(OUTP) - _aesni_gf128mul_x_ble() - movups IV, (IVP) + call _aesni_dec4 + + movdqu 0x00(OUTP), IN + pxor IN, STATE1 + movdqu STATE1, 0x00(OUTP) + + movdqu 0x10(OUTP), IN + pxor IN, STATE2 + movdqu STATE2, 0x10(OUTP) - CALL_NOSPEC r11 + movdqu 0x20(OUTP), IN + pxor IN, STATE3 + movdqu STATE3, 0x20(OUTP) - movdqu 0x40(OUTP), INC - pxor INC, STATE1 - movdqu STATE1, 0x40(OUTP) + movdqu 0x30(OUTP), IN + pxor IN, STATE4 + movdqu STATE4, 0x30(OUTP) - movdqu 0x50(OUTP), INC - pxor INC, STATE2 - movdqu STATE2, 0x50(OUTP) + _aesni_gf128mul_x_ble() - movdqu 0x60(OUTP), INC - pxor INC, STATE3 - movdqu STATE3, 0x60(OUTP) + add $64, INP + add $64, OUTP + test LEN, LEN + jnz .Lxts_dec_loop4 - movdqu 0x70(OUTP), INC - pxor INC, STATE4 - movdqu STATE4, 0x70(OUTP) +.Lxts_dec_ret_iv: + movups IV, (IVP) +.Lxts_dec_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif FRAME_END ret -SYM_FUNC_END(aesni_xts_crypt8) +.Lxts_dec_1x: + add $64, LEN + jz .Lxts_dec_ret_iv + +.Lxts_dec_loop1: + movdqu (INP), STATE + + add $16, INP + sub $16, LEN + jl .Lxts_dec_cts1 + + pxor IV, STATE + call _aesni_dec1 + pxor IV, STATE + _aesni_gf128mul_x_ble() + + test LEN, LEN + jz .Lxts_dec_out + + movdqu STATE, (OUTP) + add $16, OUTP + jmp .Lxts_dec_loop1 + +.Lxts_dec_out: + movdqu STATE, (OUTP) + jmp .Lxts_dec_ret_iv + +.Lxts_dec_cts1: + movdqa IV, STATE4 + _aesni_gf128mul_x_ble() + + pxor IV, STATE + call _aesni_dec1 + pxor IV, STATE + +#ifndef __x86_64__ + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 #endif + add LEN, INP /* rewind input pointer */ + add $16, LEN /* # bytes in final block */ + movups (INP), IN1 + + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + add OUTP, LEN + + movups (T1), %xmm4 + movaps STATE, IN2 + pshufb %xmm4, STATE + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + + pxor STATE4, STATE + call _aesni_dec1 + pxor STATE4, STATE + + movups STATE, (OUTP) + jmp .Lxts_dec_ret +SYM_FUNC_END(aesni_xts_decrypt) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 5fee47956f3b..2cf8e94d986a 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -369,7 +369,7 @@ _initial_num_blocks_is_0\@: _initial_blocks_encrypted\@: - cmp $0, %r13 + test %r13, %r13 je _zero_cipher_left\@ sub $128, %r13 @@ -528,7 +528,7 @@ _multiple_of_16_bytes\@: vmovdqu HashKey(arg2), %xmm13 mov PBlockLen(arg2), %r12 - cmp $0, %r12 + test %r12, %r12 je _partial_done\@ #GHASH computation for the last <16 Byte block @@ -573,7 +573,7 @@ _T_8\@: add $8, %r10 sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 + test %r11, %r11 je _return_T_done\@ _T_4\@: vmovd %xmm9, %eax @@ -581,7 +581,7 @@ _T_4\@: add $4, %r10 sub $4, %r11 vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 + test %r11, %r11 je _return_T_done\@ _T_123\@: vmovd %xmm9, %eax @@ -625,7 +625,7 @@ _get_AAD_blocks\@: cmp $16, %r11 jge _get_AAD_blocks\@ vmovdqu \T8, \T7 - cmp $0, %r11 + test %r11, %r11 je _get_AAD_done\@ vpxor \T7, \T7, \T7 @@ -644,7 +644,7 @@ _get_AAD_rest8\@: vpxor \T1, \T7, \T7 jmp _get_AAD_rest8\@ _get_AAD_rest4\@: - cmp $0, %r11 + test %r11, %r11 jle _get_AAD_rest0\@ mov (%r10), %eax movq %rax, \T1 @@ -749,7 +749,7 @@ _done_read_partial_block_\@: .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ AAD_HASH ENC_DEC mov PBlockLen(arg2), %r13 - cmp $0, %r13 + test %r13, %r13 je _partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN @@ -801,7 +801,7 @@ _no_extra_mask_1_\@: vpshufb %xmm2, %xmm3, %xmm3 vpxor %xmm3, \AAD_HASH, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block @@ -836,7 +836,7 @@ _no_extra_mask_2_\@: vpshufb %xmm2, %xmm9, %xmm9 vpxor %xmm9, \AAD_HASH, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block @@ -856,7 +856,7 @@ _encode_done_\@: vpshufb %xmm2, %xmm9, %xmm9 .endif # output encrypted Bytes - cmp $0, %r10 + test %r10, %r10 jl _partial_fill_\@ mov %r13, %r12 mov $16, %r13 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ad8a7188a2bf..2144e54a6c89 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -31,11 +31,10 @@ #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> +#include <linux/jump_label.h> #include <linux/workqueue.h> #include <linux/spinlock.h> -#ifdef CONFIG_X86_64 -#include <asm/crypto/glue_helper.h> -#endif +#include <linux/static_call.h> #define AESNI_ALIGN 16 @@ -93,62 +92,25 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); #define AVX_GEN2_OPTSIZE 640 #define AVX_GEN4_OPTSIZE 4096 +asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + +asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + #ifdef CONFIG_X86_64 -static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); - -asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, bool enc, le128 *iv); - -/* asmlinkage void aesni_gcm_enc() - * void *ctx, AES Key schedule. Starts on a 16 byte boundary. - * struct gcm_context_data. May be uninitialized. - * u8 *out, Ciphertext output. Encrypt in-place is allowed. - * const u8 *in, Plaintext input - * unsigned long plaintext_len, Length of data in bytes for encryption. - * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. - * 16-byte aligned pointer. - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. - * u8 *auth_tag, Authenticated Tag output. - * unsigned long auth_tag_len), Authenticated Tag Length in bytes. - * Valid values are 16 (most likely), 12 or 8. - */ -asmlinkage void aesni_gcm_enc(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -/* asmlinkage void aesni_gcm_dec() - * void *ctx, AES Key schedule. Starts on a 16 byte boundary. - * struct gcm_context_data. May be uninitialized. - * u8 *out, Plaintext output. Decrypt in-place is allowed. - * const u8 *in, Ciphertext input - * unsigned long ciphertext_len, Length of data in bytes for decryption. - * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. - * 16-byte aligned pointer. - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going - * to be 8 or 12 bytes - * u8 *auth_tag, Authenticated Tag output. - * unsigned long auth_tag_len) Authenticated Tag Length in bytes. - * Valid values are 16 (most likely), 12 or 8. - */ -asmlinkage void aesni_gcm_dec(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); +DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); /* Scatter / Gather routines, with args similar to above */ asmlinkage void aesni_gcm_init(void *ctx, @@ -167,24 +129,6 @@ asmlinkage void aesni_gcm_finalize(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -static const struct aesni_gcm_tfm_s { - void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len); - void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); - void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len); - void (*finalize)(void *ctx, struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); -} *aesni_gcm_tfm; - -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = { - .init = &aesni_gcm_init, - .enc_update = &aesni_gcm_enc_update, - .dec_update = &aesni_gcm_dec_update, - .finalize = &aesni_gcm_finalize, -}; - asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, @@ -214,25 +158,6 @@ asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { - .init = &aesni_gcm_init_avx_gen2, - .enc_update = &aesni_gcm_enc_update_avx_gen2, - .dec_update = &aesni_gcm_dec_update_avx_gen2, - .finalize = &aesni_gcm_finalize_avx_gen2, -}; - /* * asmlinkage void aesni_gcm_init_avx_gen4() * gcm_data *my_ctx_data, context data @@ -256,24 +181,8 @@ asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { - .init = &aesni_gcm_init_avx_gen4, - .enc_update = &aesni_gcm_enc_update_avx_gen4, - .dec_update = &aesni_gcm_dec_update_avx_gen4, - .finalize = &aesni_gcm_finalize_avx_gen4, -}; +static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) @@ -374,16 +283,16 @@ static int ecb_encrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -396,16 +305,16 @@ static int ecb_decrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -418,16 +327,16 @@ static int cbc_encrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -440,36 +349,133 @@ static int cbc_decrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } -#ifdef CONFIG_X86_64 -static void ctr_crypt_final(struct crypto_aes_ctx *ctx, - struct skcipher_walk *walk) +static int cts_cbc_encrypt(struct skcipher_request *req) { - u8 *ctrblk = walk->iv; - u8 keystream[AES_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = cbc_encrypt(&subreq); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int cts_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = cbc_decrypt(&subreq); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); - aesni_enc(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); - crypto_inc(ctrblk, AES_BLOCK_SIZE); + return skcipher_walk_done(&walk, 0); } +#ifdef CONFIG_X86_64 static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv) { @@ -491,120 +497,36 @@ static int ctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + u8 keystream[AES_BLOCK_SIZE]; struct skcipher_walk walk; unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); - while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); - nbytes &= AES_BLOCK_SIZE - 1; + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) + static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, + walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, + walk.iv); + nbytes &= ~AES_BLOCK_MASK; + + if (walk.nbytes == walk.total && nbytes > 0) { + aesni_enc(ctx, keystream, walk.iv); + crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes, + walk.src.virt.addr + walk.nbytes - nbytes, + keystream, nbytes); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + nbytes = 0; + } + kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } - if (walk.nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - kernel_fpu_end(); - return err; } -static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - keylen /= 2; - - /* first half of xts-key is for crypt */ - err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, - key, keylen); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, - key + keylen, keylen); -} - - -static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc); -} - -static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec); -} - -static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - aesni_xts_crypt8(ctx, dst, src, true, iv); -} - -static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - aesni_xts_crypt8(ctx, dst, src, false, iv); -} - -static const struct common_glue_ctx aesni_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = 1, - - .funcs = { { - .num_blocks = 8, - .fn_u = { .xts = aesni_xts_enc8 } - }, { - .num_blocks = 1, - .fn_u = { .xts = aesni_xts_enc } - } } -}; - -static const struct common_glue_ctx aesni_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = 1, - - .funcs = { { - .num_blocks = 8, - .fn_u = { .xts = aesni_xts_dec8 } - }, { - .num_blocks = 1, - .fn_u = { .xts = aesni_xts_dec } - } } -}; - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc, - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx), - false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc, - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx), - true); -} - static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -681,42 +603,35 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, - u8 *iv, void *aes_ctx) + u8 *iv, void *aes_ctx, u8 *auth_tag, + unsigned long auth_tag_len) { - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; - struct gcm_context_data data AESNI_ALIGN_ATTR; - struct scatter_walk dst_sg_walk = {}; + u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); + struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); unsigned long left = req->cryptlen; - unsigned long len, srclen, dstlen; struct scatter_walk assoc_sg_walk; - struct scatter_walk src_sg_walk; - struct scatterlist src_start[2]; - struct scatterlist dst_start[2]; - struct scatterlist *src_sg; - struct scatterlist *dst_sg; - u8 *src, *dst, *assoc; + struct skcipher_walk walk; + bool do_avx, do_avx2; u8 *assocmem = NULL; - u8 authTag[16]; + u8 *assoc; + int err; if (!enc) left -= auth_tag_len; - if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4) - gcm_tfm = &aesni_gcm_tfm_avx_gen2; - if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2) - gcm_tfm = &aesni_gcm_tfm_sse; + do_avx = (left >= AVX_GEN2_OPTSIZE); + do_avx2 = (left >= AVX_GEN4_OPTSIZE); /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && req->src->length && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE)) { + if (req->src->length >= assoclen && req->src->length) { scatterwalk_start(&assoc_sg_walk, req->src); assoc = scatterwalk_map(&assoc_sg_walk); } else { + gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? + GFP_KERNEL : GFP_ATOMIC; + /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, GFP_ATOMIC); + assocmem = kmalloc(assoclen, flags); if (unlikely(!assocmem)) return -ENOMEM; assoc = assocmem; @@ -724,62 +639,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); } - if (left) { - src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen); - scatterwalk_start(&src_sg_walk, src_sg); - if (req->src != req->dst) { - dst_sg = scatterwalk_ffwd(dst_start, req->dst, - req->assoclen); - scatterwalk_start(&dst_sg_walk, dst_sg); - } - } - kernel_fpu_begin(); - gcm_tfm->init(aes_ctx, &data, iv, - hash_subkey, assoc, assoclen); - if (req->src != req->dst) { - while (left) { - src = scatterwalk_map(&src_sg_walk); - dst = scatterwalk_map(&dst_sg_walk); - srclen = scatterwalk_clamp(&src_sg_walk, left); - dstlen = scatterwalk_clamp(&dst_sg_walk, left); - len = min(srclen, dstlen); - if (len) { - if (enc) - gcm_tfm->enc_update(aes_ctx, &data, - dst, src, len); - else - gcm_tfm->dec_update(aes_ctx, &data, - dst, src, len); - } - left -= len; - - scatterwalk_unmap(src); - scatterwalk_unmap(dst); - scatterwalk_advance(&src_sg_walk, len); - scatterwalk_advance(&dst_sg_walk, len); - scatterwalk_done(&src_sg_walk, 0, left); - scatterwalk_done(&dst_sg_walk, 1, left); - } - } else { - while (left) { - dst = src = scatterwalk_map(&src_sg_walk); - len = scatterwalk_clamp(&src_sg_walk, left); - if (len) { - if (enc) - gcm_tfm->enc_update(aes_ctx, &data, - src, src, len); - else - gcm_tfm->dec_update(aes_ctx, &data, - src, src, len); - } - left -= len; - scatterwalk_unmap(src); - scatterwalk_advance(&src_sg_walk, len); - scatterwalk_done(&src_sg_walk, 1, left); - } - } - gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) + aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, + assoclen); + else if (static_branch_likely(&gcm_use_avx) && do_avx) + aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, + assoclen); + else + aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); kernel_fpu_end(); if (!assocmem) @@ -787,24 +655,58 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, else kfree(assocmem); - if (!enc) { - u8 authTagMsg[16]; + err = enc ? skcipher_walk_aead_encrypt(&walk, req, false) + : skcipher_walk_aead_decrypt(&walk, req, false); - /* Copy out original authTag */ - scatterwalk_map_and_copy(authTagMsg, req->src, - req->assoclen + req->cryptlen - - auth_tag_len, - auth_tag_len, 0); + while (walk.nbytes > 0) { + kernel_fpu_begin(); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) { + if (enc) + aesni_gcm_enc_update_avx_gen4(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + else + aesni_gcm_dec_update_avx_gen4(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + } else if (static_branch_likely(&gcm_use_avx) && do_avx) { + if (enc) + aesni_gcm_enc_update_avx_gen2(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + else + aesni_gcm_dec_update_avx_gen2(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + } else if (enc) { + aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, + walk.src.virt.addr, walk.nbytes); + } else { + aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, + walk.src.virt.addr, walk.nbytes); + } + kernel_fpu_end(); - /* Compare generated tag with passed in tag. */ - return crypto_memneq(authTagMsg, authTag, auth_tag_len) ? - -EBADMSG : 0; + err = skcipher_walk_done(&walk, 0); } - /* Copy in the authTag */ - scatterwalk_map_and_copy(authTag, req->dst, - req->assoclen + req->cryptlen, - auth_tag_len, 1); + if (err) + return err; + + kernel_fpu_begin(); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) + aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, + auth_tag_len); + else if (static_branch_likely(&gcm_use_avx) && do_avx) + aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, + auth_tag_len); + else + aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); + kernel_fpu_end(); return 0; } @@ -812,15 +714,47 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, - aes_ctx); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 auth_tag[16]; + int err; + + err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, + auth_tag, auth_tag_len); + if (err) + return err; + + scatterwalk_map_and_copy(auth_tag, req->dst, + req->assoclen + req->cryptlen, + auth_tag_len, 1); + return 0; } static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, - aes_ctx); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 auth_tag_msg[16]; + u8 auth_tag[16]; + int err; + + err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, + auth_tag, auth_tag_len); + if (err) + return err; + + /* Copy out original auth_tag */ + scatterwalk_map_and_copy(auth_tag_msg, req->src, + req->assoclen + req->cryptlen - auth_tag_len, + auth_tag_len, 0); + + /* Compare generated tag with passed in tag. */ + if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { + memzero_explicit(auth_tag, sizeof(auth_tag)); + return -EBADMSG; + } + return 0; } static int helper_rfc4106_encrypt(struct aead_request *req) @@ -828,7 +762,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); unsigned int i; __be32 counter = cpu_to_be32(1); @@ -855,7 +790,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); unsigned int i; if (unlikely(req->assoclen != 16 && req->assoclen != 20)) @@ -877,6 +813,128 @@ static int helper_rfc4106_decrypt(struct aead_request *req) } #endif +static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + err = xts_verify_key(tfm, key, keylen); + if (err) + return err; + + keylen /= 2; + + /* first half of xts-key is for crypt */ + err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, + key, keylen); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, + key + keylen, keylen); +} + +static int xts_crypt(struct skcipher_request *req, bool encrypt) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % AES_BLOCK_SIZE; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + + if (unlikely(tail > 0 && walk.nbytes < walk.total)) { + int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + + skcipher_walk_abort(&walk); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + blocks * AES_BLOCK_SIZE, req->iv); + req = &subreq; + err = skcipher_walk_virt(&walk, req, false); + } else { + tail = 0; + } + + kernel_fpu_begin(); + + /* calculate first value of T */ + aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv); + + while (walk.nbytes > 0) { + int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + if (encrypt) + aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + nbytes, walk.iv); + else + aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + nbytes, walk.iv); + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + + if (walk.nbytes > 0) + kernel_fpu_begin(); + } + + if (unlikely(tail > 0 && !err)) { + struct scatterlist sg_src[2], sg_dst[2]; + struct scatterlist *src, *dst; + + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + if (encrypt) + aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + else + aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, 0); + } + return err; +} + +static int xts_encrypt(struct skcipher_request *req) +{ + return xts_crypt(req, true); +} + +static int xts_decrypt(struct skcipher_request *req) +{ + return xts_crypt(req, false); +} + static struct crypto_alg aesni_cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-aesni", @@ -928,6 +986,23 @@ static struct skcipher_alg aesni_skciphers[] = { .setkey = aesni_skcipher_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cts(cbc(aes))", + .cra_driver_name = "__cts-cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = cts_cbc_encrypt, + .decrypt = cts_cbc_decrypt, #ifdef CONFIG_X86_64 }, { .base = { @@ -946,6 +1021,7 @@ static struct skcipher_alg aesni_skciphers[] = { .setkey = aesni_skcipher_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, +#endif }, { .base = { .cra_name = "__xts(aes)", @@ -959,10 +1035,10 @@ static struct skcipher_alg aesni_skciphers[] = { .min_keysize = 2 * AES_MIN_KEY_SIZE, .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, .setkey = xts_aesni_setkey, .encrypt = xts_encrypt, .decrypt = xts_decrypt, -#endif } }; @@ -985,7 +1061,8 @@ static int generic_gcmaes_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); __be32 counter = cpu_to_be32(1); memcpy(iv, req->iv, 12); @@ -1001,7 +1078,8 @@ static int generic_gcmaes_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); memcpy(iv, req->iv, 12); *((__be32 *)(iv+12)) = counter; @@ -1066,19 +1144,18 @@ static int __init aesni_init(void) #ifdef CONFIG_X86_64 if (boot_cpu_has(X86_FEATURE_AVX2)) { pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4; + static_branch_enable(&gcm_use_avx); + static_branch_enable(&gcm_use_avx2); } else if (boot_cpu_has(X86_FEATURE_AVX)) { pr_info("AVX version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2; + static_branch_enable(&gcm_use_avx); } else { pr_info("SSE version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_sse; } - aesni_ctr_enc_tfm = aesni_ctr_enc; if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ - aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; + static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); pr_info("AES CTR mode by8 optimization enabled\n"); } #endif diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index c025a01cf708..a40365ab301e 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -58,138 +58,40 @@ void blake2s_compress_arch(struct blake2s_state *state, } EXPORT_SYMBOL(blake2s_compress_arch); -static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) +static int crypto_blake2s_update_x86(struct shash_desc *desc, + const u8 *in, unsigned int inlen) { - struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); - - if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) - return -EINVAL; - - memcpy(tctx->key, key, keylen); - tctx->keylen = keylen; - - return 0; + return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch); } -static int crypto_blake2s_init(struct shash_desc *desc) +static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) { - struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct blake2s_state *state = shash_desc_ctx(desc); - const int outlen = crypto_shash_digestsize(desc->tfm); - - if (tctx->keylen) - blake2s_init_key(state, outlen, tctx->key, tctx->keylen); - else - blake2s_init(state, outlen); - - return 0; + return crypto_blake2s_final(desc, out, blake2s_compress_arch); } -static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, - unsigned int inlen) -{ - struct blake2s_state *state = shash_desc_ctx(desc); - const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; - - if (unlikely(!inlen)) - return 0; - if (inlen > fill) { - memcpy(state->buf + state->buflen, in, fill); - blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); - state->buflen = 0; - in += fill; - inlen -= fill; +#define BLAKE2S_ALG(name, driver_name, digest_size) \ + { \ + .base.cra_name = name, \ + .base.cra_driver_name = driver_name, \ + .base.cra_priority = 200, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ + .base.cra_module = THIS_MODULE, \ + .digestsize = digest_size, \ + .setkey = crypto_blake2s_setkey, \ + .init = crypto_blake2s_init, \ + .update = crypto_blake2s_update_x86, \ + .final = crypto_blake2s_final_x86, \ + .descsize = sizeof(struct blake2s_state), \ } - if (inlen > BLAKE2S_BLOCK_SIZE) { - const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); - /* Hash one less (full) block than strictly possible */ - blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); - in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); - inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); - } - memcpy(state->buf + state->buflen, in, inlen); - state->buflen += inlen; - - return 0; -} - -static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) -{ - struct blake2s_state *state = shash_desc_ctx(desc); - - blake2s_set_lastblock(state); - memset(state->buf + state->buflen, 0, - BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - blake2s_compress_arch(state, state->buf, 1, state->buflen); - cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); - memcpy(out, state->h, state->outlen); - memzero_explicit(state, sizeof(*state)); - - return 0; -} -static struct shash_alg blake2s_algs[] = {{ - .base.cra_name = "blake2s-128", - .base.cra_driver_name = "blake2s-128-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_128_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-160", - .base.cra_driver_name = "blake2s-160-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_160_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-224", - .base.cra_driver_name = "blake2s-224-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_224_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-256", - .base.cra_driver_name = "blake2s-256-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_256_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}}; +static struct shash_alg blake2s_algs[] = { + BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE), + BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE), + BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE), + BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE), +}; static int __init blake2s_mod_init(void) { diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index cedfdba69ce3..a880e0b1c255 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -6,8 +6,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/algapi.h> @@ -247,97 +245,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk) -{ - u8 *ctrblk = walk->iv; - u8 keystream[BF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, BF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk) -{ - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - __be64 ctrblocks[4]; - - /* Process four block batch */ - if (nbytes >= bsize * 4) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - } - - /* create ctrblks for parallel encrypt */ - ctrblocks[0] = cpu_to_be64(ctrblk++); - ctrblocks[1] = cpu_to_be64(ctrblk++); - ctrblocks[2] = cpu_to_be64(ctrblk++); - ctrblocks[3] = cpu_to_be64(ctrblk++); - - blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 4; - dst += 4; - } while ((nbytes -= bsize * 4) >= bsize * 4); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - ctrblocks[0] = cpu_to_be64(ctrblk++); - - blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { - nbytes = __ctr_crypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - if (nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - - return err; -} - static struct crypto_alg bf_cipher_alg = { .cra_name = "blowfish", .cra_driver_name = "blowfish-asm", @@ -384,20 +291,6 @@ static struct skcipher_alg bf_skcipher_algs[] = { .setkey = blowfish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(blowfish)", - .base.cra_driver_name = "ctr-blowfish-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct bf_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .chunksize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index ecc0a9a905c4..e2a0e0f4bf9d 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,7 +17,6 @@ #include <linux/linkage.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -589,14 +588,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .long 0x80808080 .long 0x80808080 -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -/* For XTS mode IV generation */ -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - /* * pre-SubByte transform * @@ -998,292 +989,3 @@ SYM_FUNC_START(camellia_cbc_dec_16way) FRAME_END ret; SYM_FUNC_END(camellia_cbc_dec_16way) - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -SYM_FUNC_START(camellia_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - vmovdqa .Lbswap128_mask, %xmm14; - - /* load IV and byteswap */ - vmovdqu (%rcx), %xmm0; - vpshufb %xmm14, %xmm0, %xmm15; - vmovdqu %xmm15, 15 * 16(%rax); - - vpcmpeqd %xmm15, %xmm15, %xmm15; - vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ - - /* construct IVs */ - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 14 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 13 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm12; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm11; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm10; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm9; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm8; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm7; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm6; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm5; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm4; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm3; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm2; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm13); - vmovdqa %xmm0, %xmm13; - vpshufb %xmm14, %xmm0, %xmm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vmovdqu %xmm13, (%rcx); - - /* inpack16_pre: */ - vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap, %xmm15, %xmm15; - vpxor %xmm0, %xmm15, %xmm0; - vpxor %xmm1, %xmm15, %xmm1; - vpxor %xmm2, %xmm15, %xmm2; - vpxor %xmm3, %xmm15, %xmm3; - vpxor %xmm4, %xmm15, %xmm4; - vpxor %xmm5, %xmm15, %xmm5; - vpxor %xmm6, %xmm15, %xmm6; - vpxor %xmm7, %xmm15, %xmm7; - vpxor %xmm8, %xmm15, %xmm8; - vpxor %xmm9, %xmm15, %xmm9; - vpxor %xmm10, %xmm15, %xmm10; - vpxor %xmm11, %xmm15, %xmm11; - vpxor %xmm12, %xmm15, %xmm12; - vpxor 13 * 16(%rax), %xmm15, %xmm13; - vpxor 14 * 16(%rax), %xmm15, %xmm14; - vpxor 15 * 16(%rax), %xmm15, %xmm15; - - call __camellia_enc_blk16; - - addq $(16 * 16), %rsp; - - vpxor 0 * 16(%rdx), %xmm7, %xmm7; - vpxor 1 * 16(%rdx), %xmm6, %xmm6; - vpxor 2 * 16(%rdx), %xmm5, %xmm5; - vpxor 3 * 16(%rdx), %xmm4, %xmm4; - vpxor 4 * 16(%rdx), %xmm3, %xmm3; - vpxor 5 * 16(%rdx), %xmm2, %xmm2; - vpxor 6 * 16(%rdx), %xmm1, %xmm1; - vpxor 7 * 16(%rdx), %xmm0, %xmm0; - vpxor 8 * 16(%rdx), %xmm15, %xmm15; - vpxor 9 * 16(%rdx), %xmm14, %xmm14; - vpxor 10 * 16(%rdx), %xmm13, %xmm13; - vpxor 11 * 16(%rdx), %xmm12, %xmm12; - vpxor 12 * 16(%rdx), %xmm11, %xmm11; - vpxor 13 * 16(%rdx), %xmm10, %xmm10; - vpxor 14 * 16(%rdx), %xmm9, %xmm9; - vpxor 15 * 16(%rdx), %xmm8, %xmm8; - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - FRAME_END - ret; -SYM_FUNC_END(camellia_ctr_16way) - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -.align 8 -SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: index for input whitening key - * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 - */ - FRAME_BEGIN - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; - - /* load IV */ - vmovdqu (%rcx), %xmm0; - vpxor 0 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 15 * 16(%rax); - vmovdqu %xmm0, 0 * 16(%rsi); - - /* construct IVs */ - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 1 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 14 * 16(%rax); - vmovdqu %xmm0, 1 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 2 * 16(%rdx), %xmm0, %xmm13; - vmovdqu %xmm0, 2 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 3 * 16(%rdx), %xmm0, %xmm12; - vmovdqu %xmm0, 3 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 4 * 16(%rdx), %xmm0, %xmm11; - vmovdqu %xmm0, 4 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 5 * 16(%rdx), %xmm0, %xmm10; - vmovdqu %xmm0, 5 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 6 * 16(%rdx), %xmm0, %xmm9; - vmovdqu %xmm0, 6 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 7 * 16(%rdx), %xmm0, %xmm8; - vmovdqu %xmm0, 7 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 8 * 16(%rdx), %xmm0, %xmm7; - vmovdqu %xmm0, 8 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 9 * 16(%rdx), %xmm0, %xmm6; - vmovdqu %xmm0, 9 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 10 * 16(%rdx), %xmm0, %xmm5; - vmovdqu %xmm0, 10 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 11 * 16(%rdx), %xmm0, %xmm4; - vmovdqu %xmm0, 11 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 12 * 16(%rdx), %xmm0, %xmm3; - vmovdqu %xmm0, 12 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 13 * 16(%rdx), %xmm0, %xmm2; - vmovdqu %xmm0, 13 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 14 * 16(%rdx), %xmm0, %xmm1; - vmovdqu %xmm0, 14 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 15 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 0 * 16(%rax); - vmovdqu %xmm0, 15 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vmovdqu %xmm0, (%rcx); - - /* inpack16_pre: */ - vmovq (key_table)(CTX, %r8, 8), %xmm15; - vpshufb .Lpack_bswap, %xmm15, %xmm15; - vpxor 0 * 16(%rax), %xmm15, %xmm0; - vpxor %xmm1, %xmm15, %xmm1; - vpxor %xmm2, %xmm15, %xmm2; - vpxor %xmm3, %xmm15, %xmm3; - vpxor %xmm4, %xmm15, %xmm4; - vpxor %xmm5, %xmm15, %xmm5; - vpxor %xmm6, %xmm15, %xmm6; - vpxor %xmm7, %xmm15, %xmm7; - vpxor %xmm8, %xmm15, %xmm8; - vpxor %xmm9, %xmm15, %xmm9; - vpxor %xmm10, %xmm15, %xmm10; - vpxor %xmm11, %xmm15, %xmm11; - vpxor %xmm12, %xmm15, %xmm12; - vpxor %xmm13, %xmm15, %xmm13; - vpxor 14 * 16(%rax), %xmm15, %xmm14; - vpxor 15 * 16(%rax), %xmm15, %xmm15; - - CALL_NOSPEC r9; - - addq $(16 * 16), %rsp; - - vpxor 0 * 16(%rsi), %xmm7, %xmm7; - vpxor 1 * 16(%rsi), %xmm6, %xmm6; - vpxor 2 * 16(%rsi), %xmm5, %xmm5; - vpxor 3 * 16(%rsi), %xmm4, %xmm4; - vpxor 4 * 16(%rsi), %xmm3, %xmm3; - vpxor 5 * 16(%rsi), %xmm2, %xmm2; - vpxor 6 * 16(%rsi), %xmm1, %xmm1; - vpxor 7 * 16(%rsi), %xmm0, %xmm0; - vpxor 8 * 16(%rsi), %xmm15, %xmm15; - vpxor 9 * 16(%rsi), %xmm14, %xmm14; - vpxor 10 * 16(%rsi), %xmm13, %xmm13; - vpxor 11 * 16(%rsi), %xmm12, %xmm12; - vpxor 12 * 16(%rsi), %xmm11, %xmm11; - vpxor 13 * 16(%rsi), %xmm10, %xmm10; - vpxor 14 * 16(%rsi), %xmm9, %xmm9; - vpxor 15 * 16(%rsi), %xmm8, %xmm8; - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - FRAME_END - ret; -SYM_FUNC_END(camellia_xts_crypt_16way) - -SYM_FUNC_START(camellia_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - xorl %r8d, %r8d; /* input whitening key, 0 for enc */ - - leaq __camellia_enc_blk16, %r9; - - jmp camellia_xts_crypt_16way; -SYM_FUNC_END(camellia_xts_enc_16way) - -SYM_FUNC_START(camellia_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - cmpl $16, key_length(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* input whitening key, last for dec */ - - leaq __camellia_dec_blk16, %r9; - - jmp camellia_xts_crypt_16way; -SYM_FUNC_END(camellia_xts_dec_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0907243c501c..782e9712a1ec 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -7,7 +7,6 @@ #include <linux/linkage.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -625,16 +624,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .section .rodata.cst16, "aM", @progbits, 16 .align 16 -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -/* For XTS mode */ -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - /* * pre-SubByte transform * @@ -1061,343 +1050,3 @@ SYM_FUNC_START(camellia_cbc_dec_32way) FRAME_END ret; SYM_FUNC_END(camellia_cbc_dec_32way) - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ - vpcmpeqq minus_one, x, tmp1; \ - vpcmpeqq minus_two, x, tmp2; \ - vpsubq minus_two, x, x; \ - vpor tmp2, tmp1, tmp1; \ - vpslldq $8, tmp1, tmp1; \ - vpsubq tmp1, x, x; - -SYM_FUNC_START(camellia_ctr_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - vzeroupper; - - movq %rsp, %r10; - cmpq %rsi, %rdx; - je .Lctr_use_stack; - - /* dst can be used as temporary storage, src is not overwritten. */ - movq %rsi, %rax; - jmp .Lctr_continue; - -.Lctr_use_stack: - subq $(16 * 32), %rsp; - movq %rsp, %rax; - -.Lctr_continue: - vpcmpeqd %ymm15, %ymm15, %ymm15; - vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ - vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ - - /* load IV and byteswap */ - vmovdqu (%rcx), %xmm0; - vmovdqa %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm14); - vbroadcasti128 .Lbswap128_mask, %ymm14; - vinserti128 $1, %xmm0, %ymm1, %ymm0; - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 15 * 32(%rax); - - /* construct IVs */ - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 14 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 13 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 12 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 11 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm10; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm9; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm8; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm7; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm6; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm5; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm4; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm3; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm2; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm1; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vextracti128 $1, %ymm0, %xmm13; - vpshufb %ymm14, %ymm0, %ymm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vmovdqu %xmm13, (%rcx); - - /* inpack32_pre: */ - vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap, %ymm15, %ymm15; - vpxor %ymm0, %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor 11 * 32(%rax), %ymm15, %ymm11; - vpxor 12 * 32(%rax), %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - call __camellia_enc_blk32; - - movq %r10, %rsp; - - vpxor 0 * 32(%rdx), %ymm7, %ymm7; - vpxor 1 * 32(%rdx), %ymm6, %ymm6; - vpxor 2 * 32(%rdx), %ymm5, %ymm5; - vpxor 3 * 32(%rdx), %ymm4, %ymm4; - vpxor 4 * 32(%rdx), %ymm3, %ymm3; - vpxor 5 * 32(%rdx), %ymm2, %ymm2; - vpxor 6 * 32(%rdx), %ymm1, %ymm1; - vpxor 7 * 32(%rdx), %ymm0, %ymm0; - vpxor 8 * 32(%rdx), %ymm15, %ymm15; - vpxor 9 * 32(%rdx), %ymm14, %ymm14; - vpxor 10 * 32(%rdx), %ymm13, %ymm13; - vpxor 11 * 32(%rdx), %ymm12, %ymm12; - vpxor 12 * 32(%rdx), %ymm11, %ymm11; - vpxor 13 * 32(%rdx), %ymm10, %ymm10; - vpxor 14 * 32(%rdx), %ymm9, %ymm9; - vpxor 15 * 32(%rdx), %ymm8, %ymm8; - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(camellia_ctr_32way) - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ - vpsrad $31, iv, tmp0; \ - vpaddq iv, iv, tmp1; \ - vpsllq $2, iv, iv; \ - vpshufd $0x13, tmp0, tmp0; \ - vpsrad $31, tmp1, tmp1; \ - vpand mask2, tmp0, tmp0; \ - vpshufd $0x13, tmp1, tmp1; \ - vpxor tmp0, iv, iv; \ - vpand mask1, tmp1, tmp1; \ - vpxor tmp1, iv, iv; - -.align 8 -SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: index for input whitening key - * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 - */ - FRAME_BEGIN - - vzeroupper; - - subq $(16 * 32), %rsp; - movq %rsp, %rax; - - vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; - - /* load IV and construct second IV */ - vmovdqu (%rcx), %xmm0; - vmovdqa %xmm0, %xmm15; - gf128mul_x_ble(%xmm0, %xmm12, %xmm13); - vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; - vinserti128 $1, %xmm0, %ymm15, %ymm0; - vpxor 0 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 15 * 32(%rax); - vmovdqu %ymm0, 0 * 32(%rsi); - - /* construct IVs */ - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 1 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 14 * 32(%rax); - vmovdqu %ymm0, 1 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 2 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 13 * 32(%rax); - vmovdqu %ymm0, 2 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 3 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 12 * 32(%rax); - vmovdqu %ymm0, 3 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 4 * 32(%rdx), %ymm0, %ymm11; - vmovdqu %ymm0, 4 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 5 * 32(%rdx), %ymm0, %ymm10; - vmovdqu %ymm0, 5 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 6 * 32(%rdx), %ymm0, %ymm9; - vmovdqu %ymm0, 6 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 7 * 32(%rdx), %ymm0, %ymm8; - vmovdqu %ymm0, 7 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 8 * 32(%rdx), %ymm0, %ymm7; - vmovdqu %ymm0, 8 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 9 * 32(%rdx), %ymm0, %ymm6; - vmovdqu %ymm0, 9 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 10 * 32(%rdx), %ymm0, %ymm5; - vmovdqu %ymm0, 10 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 11 * 32(%rdx), %ymm0, %ymm4; - vmovdqu %ymm0, 11 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 12 * 32(%rdx), %ymm0, %ymm3; - vmovdqu %ymm0, 12 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 13 * 32(%rdx), %ymm0, %ymm2; - vmovdqu %ymm0, 13 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 14 * 32(%rdx), %ymm0, %ymm1; - vmovdqu %ymm0, 14 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 15 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 0 * 32(%rax); - vmovdqu %ymm0, 15 * 32(%rsi); - - vextracti128 $1, %ymm0, %xmm0; - gf128mul_x_ble(%xmm0, %xmm12, %xmm15); - vmovdqu %xmm0, (%rcx); - - /* inpack32_pre: */ - vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; - vpshufb .Lpack_bswap, %ymm15, %ymm15; - vpxor 0 * 32(%rax), %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor %ymm11, %ymm15, %ymm11; - vpxor 12 * 32(%rax), %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - CALL_NOSPEC r9; - - addq $(16 * 32), %rsp; - - vpxor 0 * 32(%rsi), %ymm7, %ymm7; - vpxor 1 * 32(%rsi), %ymm6, %ymm6; - vpxor 2 * 32(%rsi), %ymm5, %ymm5; - vpxor 3 * 32(%rsi), %ymm4, %ymm4; - vpxor 4 * 32(%rsi), %ymm3, %ymm3; - vpxor 5 * 32(%rsi), %ymm2, %ymm2; - vpxor 6 * 32(%rsi), %ymm1, %ymm1; - vpxor 7 * 32(%rsi), %ymm0, %ymm0; - vpxor 8 * 32(%rsi), %ymm15, %ymm15; - vpxor 9 * 32(%rsi), %ymm14, %ymm14; - vpxor 10 * 32(%rsi), %ymm13, %ymm13; - vpxor 11 * 32(%rsi), %ymm12, %ymm12; - vpxor 12 * 32(%rsi), %ymm11, %ymm11; - vpxor 13 * 32(%rsi), %ymm10, %ymm10; - vpxor 14 * 32(%rsi), %ymm9, %ymm9; - vpxor 15 * 32(%rsi), %ymm8, %ymm8; - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(camellia_xts_crypt_32way) - -SYM_FUNC_START(camellia_xts_enc_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - xorl %r8d, %r8d; /* input whitening key, 0 for enc */ - - leaq __camellia_enc_blk32, %r9; - - jmp camellia_xts_crypt_32way; -SYM_FUNC_END(camellia_xts_enc_32way) - -SYM_FUNC_START(camellia_xts_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - cmpl $16, key_length(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* input whitening key, last for dec */ - - leaq __camellia_dec_blk32, %r9; - - jmp camellia_xts_crypt_32way; -SYM_FUNC_END(camellia_xts_dec_32way) diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/crypto/camellia.h index f6d91861cb14..1dcea79e8f8e 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/crypto/camellia.h @@ -19,18 +19,10 @@ struct camellia_ctx { u32 key_length; }; -struct camellia_xts_ctx { - struct camellia_ctx tweak_ctx; - struct camellia_ctx crypt_ctx; -}; - extern int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, unsigned int key_len); -extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen); - /* regular block cipher functions */ asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, bool xor); @@ -46,13 +38,6 @@ asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src) { @@ -78,14 +63,5 @@ static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst, /* glue helpers */ extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src); -extern void camellia_crypt_ctr(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -extern void camellia_crypt_ctr_2way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -extern void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -extern void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); #endif /* ASM_X86_CAMELLIA_H */ diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index ccda647422d6..e7e4d64e9577 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -5,16 +5,16 @@ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ -#include <asm/crypto/camellia.h> -#include <asm/crypto/glue_helper.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> -#include <crypto/xts.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> +#include "camellia.h" +#include "ecb_cbc_helpers.h" + #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 @@ -23,121 +23,6 @@ asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_16way } - }, { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - -static const struct common_glue_ctx camellia_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_enc } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_dec_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_dec } - } } -}; static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -147,45 +32,39 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_enc_32way); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_dec_32way); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_cbc_dec_32way); + CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg camellia_algs[] = { @@ -216,35 +95,6 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(camellia)", - .base.cra_driver_name = "__ctr-camellia-aesni-avx2", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(camellia)", - .base.cra_driver_name = "__xts-camellia-aesni-avx2", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, - .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 4e5de6ef206e..c7ccf63e741e 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -5,16 +5,16 @@ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> */ -#include <asm/crypto/camellia.h> -#include <asm/crypto/glue_helper.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> -#include <crypto/xts.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> +#include "camellia.h" +#include "ecb_cbc_helpers.h" + #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 /* 16-way parallel cipher functions (avx/aes-ni) */ @@ -27,120 +27,6 @@ EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); -asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_ctr_16way); - -asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); - -asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); - -void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk); -} -EXPORT_SYMBOL_GPL(camellia_xts_enc); - -void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk); -} -EXPORT_SYMBOL_GPL(camellia_xts_dec); - -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_16way } - }, { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - -static const struct common_glue_ctx camellia_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_enc } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_dec_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_dec } - } } -}; - static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -149,65 +35,36 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); -} - -int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} -EXPORT_SYMBOL_GPL(xts_camellia_setkey); - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg camellia_algs[] = { @@ -238,36 +95,7 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(camellia)", - .base.cra_driver_name = "__ctr-camellia-aesni", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(camellia)", - .base.cra_driver_name = "__xts-camellia-aesni", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, - .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, + } }; static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 242c056e5fa8..66c435ba9d3d 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -14,8 +14,9 @@ #include <linux/module.h> #include <linux/types.h> #include <crypto/algapi.h> -#include <asm/crypto/camellia.h> -#include <asm/crypto/glue_helper.h> + +#include "camellia.h" +#include "ecb_cbc_helpers.h" /* regular block cipher functions */ asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, @@ -1262,129 +1263,47 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return camellia_setkey(&tfm->base, key, key_len); } -void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s) +void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src) { - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - u128 iv = *src; - - camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); + u8 buf[CAMELLIA_BLOCK_SIZE]; + const u8 *iv = src; - u128_xor(&dst[1], &dst[1], &iv); + if (dst == src) + iv = memcpy(buf, iv, sizeof(buf)); + camellia_dec_blk_2way(ctx, dst, src); + crypto_xor(dst + CAMELLIA_BLOCK_SIZE, iv, CAMELLIA_BLOCK_SIZE); } EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) - *dst = *src; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(camellia_crypt_ctr); - -void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblks[2]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - } - - le128_to_be128(&ctrblks[0], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[1], iv); - le128_inc(iv); - - camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); -} -EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way); - -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct crypto_alg camellia_cipher_alg = { @@ -1433,20 +1352,6 @@ static struct skcipher_alg camellia_skcipher_algs[] = { .setkey = camellia_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(camellia)", - .base.cra_driver_name = "ctr-camellia-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 384ccb00f9e1..3976a87f92ad 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -6,7 +6,6 @@ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> */ -#include <asm/crypto/glue_helper.h> #include <crypto/algapi.h> #include <crypto/cast5.h> #include <crypto/internal/simd.h> @@ -15,6 +14,8 @@ #include <linux/module.h> #include <linux/types.h> +#include "ecb_cbc_helpers.h" + #define CAST5_PARALLEL_BLOCKS 16 asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst, @@ -23,8 +24,6 @@ asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); -asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src, - __be64 *iv); static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -32,272 +31,35 @@ static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return cast5_setkey(&tfm->base, key, keylen); } -static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk, - unsigned int nbytes) -{ - return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS, - walk, fpu_enabled, nbytes); -} - -static inline void cast5_fpu_end(bool fpu_enabled) -{ - return glue_fpu_end(fpu_enabled); -} - -static int ecb_crypt(struct skcipher_request *req, bool enc) -{ - bool fpu_enabled = false; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes; - void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src); - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - u8 *wsrc = walk.src.virt.addr; - u8 *wdst = walk.dst.virt.addr; - - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way; - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize * CAST5_PARALLEL_BLOCKS; - wdst += bsize * CAST5_PARALLEL_BLOCKS; - nbytes -= bsize * CAST5_PARALLEL_BLOCKS; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - fn = (enc) ? __cast5_encrypt : __cast5_decrypt; - - /* Handle leftovers */ - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - return err; -} - static int ecb_encrypt(struct skcipher_request *req) { - return ecb_crypt(req, true); + ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way); + ECB_BLOCK(1, __cast5_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return ecb_crypt(req, false); + ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way); + ECB_BLOCK(1, __cast5_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - const unsigned int bsize = CAST5_BLOCK_SIZE; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - u64 *src = (u64 *)walk.src.virt.addr; - u64 *dst = (u64 *)walk.dst.virt.addr; - u64 *iv = (u64 *)walk.iv; - - do { - *dst = *src ^ *iv; - __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - src++; - dst++; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk.iv = *iv; - err = skcipher_walk_done(&walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct cast5_ctx *ctx, - struct skcipher_walk *walk) -{ - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 last_iv; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1); - src -= CAST5_PARALLEL_BLOCKS - 1; - dst -= CAST5_PARALLEL_BLOCKS - 1; - - cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - } - - /* Handle leftovers */ - for (;;) { - __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } - -done: - *dst ^= *(u64 *)walk->iv; - *(u64 *)walk->iv = last_iv; - - return nbytes; + CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__cast5_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - nbytes = __cbc_decrypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - return err; -} - -static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx) -{ - u8 *ctrblk = walk->iv; - u8 keystream[CAST5_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, CAST5_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct skcipher_walk *walk, - struct cast5_ctx *ctx) -{ - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - do { - cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src, - (__be64 *)walk->iv); - - src += CAST5_PARALLEL_BLOCKS; - dst += CAST5_PARALLEL_BLOCKS; - nbytes -= bsize * CAST5_PARALLEL_BLOCKS; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - u64 ctrblk; - - if (dst != src) - *dst = *src; - - ctrblk = *(u64 *)walk->iv; - be64_add_cpu((__be64 *)walk->iv, 1); - - __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - *dst ^= ctrblk; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - nbytes = __ctr_crypt(&walk, ctx); - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(&walk, ctx); - err = skcipher_walk_done(&walk, 0); - } - - return err; + CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way); + CBC_DEC_BLOCK(1, __cast5_decrypt); + CBC_WALK_END(); } static struct skcipher_alg cast5_algs[] = { @@ -328,21 +90,6 @@ static struct skcipher_alg cast5_algs[] = { .setkey = cast5_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(cast5)", - .base.cra_driver_name = "__ctr-cast5-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct cast5_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .ivsize = CAST5_BLOCK_SIZE, - .chunksize = CAST5_BLOCK_SIZE, - .setkey = cast5_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 932a3ce32a88..fbddcecc3e3f 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -212,8 +212,6 @@ .section .rodata.cst16, "aM", @progbits, 16 .align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .Lbswap128_mask: @@ -412,85 +410,3 @@ SYM_FUNC_START(cast6_cbc_dec_8way) FRAME_END ret; SYM_FUNC_END(cast6_cbc_dec_8way) - -SYM_FUNC_START(cast6_ctr_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - pushq %r12; - pushq %r15 - - movq %rdi, CTX; - movq %rsi, %r11; - movq %rdx, %r12; - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RX, RKR, RKM); - - call __cast6_enc_blk8; - - store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - popq %r12; - FRAME_END - ret; -SYM_FUNC_END(cast6_ctr_8way) - -SYM_FUNC_START(cast6_xts_enc_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - pushq %r15; - - movq %rdi, CTX - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); - - call __cast6_enc_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - FRAME_END - ret; -SYM_FUNC_END(cast6_xts_enc_8way) - -SYM_FUNC_START(cast6_xts_dec_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - pushq %r15; - - movq %rdi, CTX - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); - - call __cast6_dec_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - FRAME_END - ret; -SYM_FUNC_END(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 48e0f37796fa..7e2aea372349 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -15,8 +15,8 @@ #include <crypto/algapi.h> #include <crypto/cast6.h> #include <crypto/internal/simd.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> + +#include "ecb_cbc_helpers.h" #define CAST6_PARALLEL_BLOCKS 8 @@ -24,13 +24,6 @@ asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -38,172 +31,35 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, return cast6_setkey(&tfm->base, key, keylen); } -static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt); -} - -static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt); -} - -static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} - -static const struct common_glue_ctx cast6_enc = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = cast6_ecb_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __cast6_encrypt } - } } -}; - -static const struct common_glue_ctx cast6_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ctr = cast6_ctr_8way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = cast6_crypt_ctr } - } } -}; - -static const struct common_glue_ctx cast6_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = cast6_xts_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = cast6_xts_enc } - } } -}; - -static const struct common_glue_ctx cast6_dec = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = cast6_ecb_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __cast6_decrypt } - } } -}; - -static const struct common_glue_ctx cast6_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .cbc = cast6_cbc_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __cast6_decrypt } - } } -}; - -static const struct common_glue_ctx cast6_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = cast6_xts_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = cast6_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&cast6_enc, req); + ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_enc_8way); + ECB_BLOCK(1, __cast6_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&cast6_dec, req); + ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_dec_8way); + ECB_BLOCK(1, __cast6_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req); + CBC_WALK_START(req, CAST6_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__cast6_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&cast6_ctr, req); -} - -struct cast6_xts_ctx { - struct cast6_ctx tweak_ctx; - struct cast6_ctx crypt_ctx; -}; - -static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_cbc_dec_8way); + CBC_DEC_BLOCK(1, __cast6_decrypt); + CBC_WALK_END(); } static struct skcipher_alg cast6_algs[] = { @@ -234,35 +90,6 @@ static struct skcipher_alg cast6_algs[] = { .setkey = cast6_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(cast6)", - .base.cra_driver_name = "__ctr-cast6-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct cast6_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .chunksize = CAST6_BLOCK_SIZE, - .setkey = cast6_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(cast6)", - .base.cra_driver_name = "__xts-cast6-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAST6_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cast6_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAST6_MIN_KEY_SIZE, - .max_keysize = 2 * CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = xts_cast6_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 89830e531350..e7cb68a3db3b 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -6,8 +6,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/algapi.h> @@ -253,94 +251,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, - struct skcipher_walk *walk) -{ - u8 *ctrblk = walk->iv; - u8 keystream[DES3_EDE_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx, - struct skcipher_walk *walk) -{ - unsigned int bsize = DES3_EDE_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - __be64 *src = (__be64 *)walk->src.virt.addr; - __be64 *dst = (__be64 *)walk->dst.virt.addr; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - __be64 ctrblocks[3]; - - /* Process four block batch */ - if (nbytes >= bsize * 3) { - do { - /* create ctrblks for parallel encrypt */ - ctrblocks[0] = cpu_to_be64(ctrblk++); - ctrblocks[1] = cpu_to_be64(ctrblk++); - ctrblocks[2] = cpu_to_be64(ctrblk++); - - des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks, - (u8 *)ctrblocks); - - dst[0] = src[0] ^ ctrblocks[0]; - dst[1] = src[1] ^ ctrblocks[1]; - dst[2] = src[2] ^ ctrblocks[2]; - - src += 3; - dst += 3; - } while ((nbytes -= bsize * 3) >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - ctrblocks[0] = cpu_to_be64(ctrblk++); - - des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - - dst[0] = src[0] ^ ctrblocks[0]; - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) { - nbytes = __ctr_crypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - if (nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - - return err; -} - static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { @@ -428,20 +338,6 @@ static struct skcipher_alg des3_ede_skciphers[] = { .setkey = des3_ede_x86_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(des3_ede)", - .base.cra_driver_name = "ctr-des3_ede-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .chunksize = DES3_EDE_BLOCK_SIZE, - .setkey = des3_ede_x86_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h new file mode 100644 index 000000000000..eaa15c7b29d6 --- /dev/null +++ b/arch/x86/crypto/ecb_cbc_helpers.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRYPTO_ECB_CBC_HELPER_H +#define _CRYPTO_ECB_CBC_HELPER_H + +#include <crypto/internal/skcipher.h> +#include <asm/fpu/api.h> + +/* + * Mode helpers to instantiate parameterized skcipher ECB/CBC modes without + * having to rely on indirect calls and retpolines. + */ + +#define ECB_WALK_START(req, bsize, fpu_blocks) do { \ + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \ + const int __bsize = (bsize); \ + struct skcipher_walk walk; \ + int err = skcipher_walk_virt(&walk, (req), false); \ + while (walk.nbytes > 0) { \ + unsigned int nbytes = walk.nbytes; \ + bool do_fpu = (fpu_blocks) != -1 && \ + nbytes >= (fpu_blocks) * __bsize; \ + const u8 *src = walk.src.virt.addr; \ + u8 *dst = walk.dst.virt.addr; \ + u8 __maybe_unused buf[(bsize)]; \ + if (do_fpu) kernel_fpu_begin() + +#define CBC_WALK_START(req, bsize, fpu_blocks) \ + ECB_WALK_START(req, bsize, fpu_blocks) + +#define ECB_WALK_ADVANCE(blocks) do { \ + dst += (blocks) * __bsize; \ + src += (blocks) * __bsize; \ + nbytes -= (blocks) * __bsize; \ +} while (0) + +#define ECB_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + (func)(ctx, dst, src); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define CBC_ENC_BLOCK(func) do { \ + const u8 *__iv = walk.iv; \ + while (nbytes >= __bsize) { \ + crypto_xor_cpy(dst, src, __iv, __bsize); \ + (func)(ctx, dst, dst); \ + __iv = dst; \ + ECB_WALK_ADVANCE(1); \ + } \ + memcpy(walk.iv, __iv, __bsize); \ +} while (0) + +#define CBC_DEC_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + const u8 *__iv = src + ((blocks) - 1) * __bsize; \ + if (dst == src) \ + __iv = memcpy(buf, __iv, __bsize); \ + (func)(ctx, dst, src); \ + crypto_xor(dst, walk.iv, __bsize); \ + memcpy(walk.iv, __iv, __bsize); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define ECB_WALK_END() \ + if (do_fpu) kernel_fpu_end(); \ + err = skcipher_walk_done(&walk, nbytes); \ + } \ + return err; \ +} while (0) + +#define CBC_WALK_END() ECB_WALK_END() + +#endif diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S index d08fc575ef7f..3da385271227 100644 --- a/arch/x86/crypto/glue_helper-asm-avx.S +++ b/arch/x86/crypto/glue_helper-asm-avx.S @@ -34,107 +34,3 @@ vpxor (5*16)(src), x6, x6; \ vpxor (6*16)(src), x7, x7; \ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \ - vpcmpeqd t0, t0, t0; \ - vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \ - vmovdqa bswap, t1; \ - \ - /* load IV and byteswap */ \ - vmovdqu (iv), x7; \ - vpshufb t1, x7, x0; \ - \ - /* construct IVs */ \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x1; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x2; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x3; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x4; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x5; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x6; \ - inc_le128(x7, t0, t2); \ - vmovdqa x7, t2; \ - vpshufb t1, x7, x7; \ - inc_le128(t2, t0, t1); \ - vmovdqu t2, (iv); - -#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*16)(src), x0, x0; \ - vpxor (1*16)(src), x1, x1; \ - vpxor (2*16)(src), x2, x2; \ - vpxor (3*16)(src), x3, x3; \ - vpxor (4*16)(src), x4, x4; \ - vpxor (5*16)(src), x5, x5; \ - vpxor (6*16)(src), x6, x6; \ - vpxor (7*16)(src), x7, x7; \ - store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ - t1, xts_gf128mul_and_shl1_mask) \ - vmovdqa xts_gf128mul_and_shl1_mask, t0; \ - \ - /* load IV */ \ - vmovdqu (iv), tiv; \ - vpxor (0*16)(src), tiv, x0; \ - vmovdqu tiv, (0*16)(dst); \ - \ - /* construct and store IVs, also xor with source */ \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (1*16)(src), tiv, x1; \ - vmovdqu tiv, (1*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (2*16)(src), tiv, x2; \ - vmovdqu tiv, (2*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (3*16)(src), tiv, x3; \ - vmovdqu tiv, (3*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (4*16)(src), tiv, x4; \ - vmovdqu tiv, (4*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (5*16)(src), tiv, x5; \ - vmovdqu tiv, (5*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (6*16)(src), tiv, x6; \ - vmovdqu tiv, (6*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (7*16)(src), tiv, x7; \ - vmovdqu tiv, (7*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vmovdqu tiv, (iv); - -#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*16)(dst), x0, x0; \ - vpxor (1*16)(dst), x1, x1; \ - vpxor (2*16)(dst), x2, x2; \ - vpxor (3*16)(dst), x3, x3; \ - vpxor (4*16)(dst), x4, x4; \ - vpxor (5*16)(dst), x5, x5; \ - vpxor (6*16)(dst), x6, x6; \ - vpxor (7*16)(dst), x7, x7; \ - store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S index d84508c85c13..c77e9049431f 100644 --- a/arch/x86/crypto/glue_helper-asm-avx2.S +++ b/arch/x86/crypto/glue_helper-asm-avx2.S @@ -37,139 +37,3 @@ vpxor (5*32+16)(src), x6, x6; \ vpxor (6*32+16)(src), x7, x7; \ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ - vpcmpeqq minus_one, x, tmp1; \ - vpcmpeqq minus_two, x, tmp2; \ - vpsubq minus_two, x, x; \ - vpor tmp2, tmp1, tmp1; \ - vpslldq $8, tmp1, tmp1; \ - vpsubq tmp1, x, x; - -#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \ - t1x, t2, t2x, t3, t3x, t4, t5) \ - vpcmpeqd t0, t0, t0; \ - vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \ - vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\ - \ - /* load IV and byteswap */ \ - vmovdqu (iv), t2x; \ - vmovdqa t2x, t3x; \ - inc_le128(t2x, t0x, t1x); \ - vbroadcasti128 bswap, t1; \ - vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \ - vpshufb t1, t2, x0; \ - \ - /* construct IVs */ \ - add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \ - vpshufb t1, t2, x1; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x2; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x3; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x4; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x5; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x6; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x7; \ - vextracti128 $1, t2, t2x; \ - inc_le128(t2x, t0x, t3x); \ - vmovdqu t2x, (iv); - -#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*32)(src), x0, x0; \ - vpxor (1*32)(src), x1, x1; \ - vpxor (2*32)(src), x2, x2; \ - vpxor (3*32)(src), x3, x3; \ - vpxor (4*32)(src), x4, x4; \ - vpxor (5*32)(src), x5, x5; \ - vpxor (6*32)(src), x6, x6; \ - vpxor (7*32)(src), x7, x7; \ - store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ - vpsrad $31, iv, tmp0; \ - vpaddq iv, iv, tmp1; \ - vpsllq $2, iv, iv; \ - vpshufd $0x13, tmp0, tmp0; \ - vpsrad $31, tmp1, tmp1; \ - vpand mask2, tmp0, tmp0; \ - vpshufd $0x13, tmp1, tmp1; \ - vpxor tmp0, iv, iv; \ - vpand mask1, tmp1, tmp1; \ - vpxor tmp1, iv, iv; - -#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \ - tivx, t0, t0x, t1, t1x, t2, t2x, t3, \ - xts_gf128mul_and_shl1_mask_0, \ - xts_gf128mul_and_shl1_mask_1) \ - vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \ - \ - /* load IV and construct second IV */ \ - vmovdqu (iv), tivx; \ - vmovdqa tivx, t0x; \ - gf128mul_x_ble(tivx, t1x, t2x); \ - vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \ - vinserti128 $1, tivx, t0, tiv; \ - vpxor (0*32)(src), tiv, x0; \ - vmovdqu tiv, (0*32)(dst); \ - \ - /* construct and store IVs, also xor with source */ \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (1*32)(src), tiv, x1; \ - vmovdqu tiv, (1*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (2*32)(src), tiv, x2; \ - vmovdqu tiv, (2*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (3*32)(src), tiv, x3; \ - vmovdqu tiv, (3*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (4*32)(src), tiv, x4; \ - vmovdqu tiv, (4*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (5*32)(src), tiv, x5; \ - vmovdqu tiv, (5*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (6*32)(src), tiv, x6; \ - vmovdqu tiv, (6*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (7*32)(src), tiv, x7; \ - vmovdqu tiv, (7*32)(dst); \ - \ - vextracti128 $1, tiv, tivx; \ - gf128mul_x_ble(tivx, t1x, t2x); \ - vmovdqu tivx, (iv); - -#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*32)(dst), x0, x0; \ - vpxor (1*32)(dst), x1, x1; \ - vpxor (2*32)(dst), x2, x2; \ - vpxor (3*32)(dst), x3, x3; \ - vpxor (4*32)(dst), x4, x4; \ - vpxor (5*32)(dst), x5, x5; \ - vpxor (6*32)(dst), x6, x6; \ - vpxor (7*32)(dst), x7, x7; \ - store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c deleted file mode 100644 index d3d91a0abf88..000000000000 --- a/arch/x86/crypto/glue_helper.c +++ /dev/null @@ -1,381 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Shared glue code for 128bit block ciphers - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> - * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> - */ - -#include <linux/module.h> -#include <crypto/b128ops.h> -#include <crypto/gf128mul.h> -#include <crypto/internal/skcipher.h> -#include <crypto/scatterwalk.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> - -int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u8 *src = walk.src.virt.addr; - u8 *dst = walk.dst.virt.addr; - unsigned int func_bytes; - unsigned int i; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - for (i = 0; i < gctx->num_funcs; i++) { - func_bytes = bsize * gctx->funcs[i].num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - gctx->funcs[i].fn_u.ecb(ctx, dst, src); - src += func_bytes; - dst += func_bytes; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - break; - } - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} -EXPORT_SYMBOL_GPL(glue_ecb_req_128bit); - -int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u128 *src = (u128 *)walk.src.virt.addr; - u128 *dst = (u128 *)walk.dst.virt.addr; - u128 *iv = (u128 *)walk.iv; - - do { - u128_xor(dst, src, iv); - fn(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - src++; - dst++; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u128 *)walk.iv = *iv; - err = skcipher_walk_done(&walk, nbytes); - } - return err; -} -EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit); - -int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u128 *src = walk.src.virt.addr; - u128 *dst = walk.dst.virt.addr; - unsigned int func_bytes, num_blocks; - unsigned int i; - u128 last_iv; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - src -= num_blocks - 1; - dst -= num_blocks - 1; - - gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst, - (const u8 *)src); - - nbytes -= func_bytes; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, --src); - dst--; - } while (nbytes >= func_bytes); - } -done: - u128_xor(dst, dst, (u128 *)walk.iv); - *(u128 *)walk.iv = last_iv; - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} -EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); - -int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= bsize) { - const u128 *src = walk.src.virt.addr; - u128 *dst = walk.dst.virt.addr; - unsigned int func_bytes, num_blocks; - unsigned int i; - le128 ctrblk; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - - be128_to_le128(&ctrblk, (be128 *)walk.iv); - - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst, - (const u8 *)src, - &ctrblk); - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - break; - } - - le128_to_be128((be128 *)walk.iv, &ctrblk); - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - - if (nbytes) { - le128 ctrblk; - u128 tmp; - - be128_to_le128(&ctrblk, (be128 *)walk.iv); - memcpy(&tmp, walk.src.virt.addr, nbytes); - gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp, - (const u8 *)&tmp, - &ctrblk); - memcpy(walk.dst.virt.addr, &tmp, nbytes); - le128_to_be128((be128 *)walk.iv, &ctrblk); - - err = skcipher_walk_done(&walk, 0); - } - - return err; -} -EXPORT_SYMBOL_GPL(glue_ctr_req_128bit); - -static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, - void *ctx, - struct skcipher_walk *walk) -{ - const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = walk->src.virt.addr; - u128 *dst = walk->dst.virt.addr; - unsigned int num_blocks, func_bytes; - unsigned int i; - - /* Process multi-block batch */ - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes >= func_bytes) { - do { - gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst, - (const u8 *)src, - walk->iv); - - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; - } - } - -done: - return nbytes; -} - -int glue_xts_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req, - common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx, bool decrypt) -{ - const bool cts = (req->cryptlen % XTS_BLOCK_SIZE); - const unsigned int bsize = 128 / 8; - struct skcipher_request subreq; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes, tail; - int err; - - if (req->cryptlen < XTS_BLOCK_SIZE) - return -EINVAL; - - if (unlikely(cts)) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - - tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE; - - skcipher_request_set_tfm(&subreq, tfm); - skcipher_request_set_callback(&subreq, - crypto_skcipher_get_flags(tfm), - NULL, NULL); - skcipher_request_set_crypt(&subreq, req->src, req->dst, - req->cryptlen - tail, req->iv); - req = &subreq; - } - - err = skcipher_walk_virt(&walk, req, false); - nbytes = walk.nbytes; - if (err) - return err; - - /* set minimum length to bsize, for tweak_fn */ - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, - nbytes < bsize ? bsize : nbytes); - - /* calculate first value of T */ - tweak_fn(tweak_ctx, walk.iv, walk.iv); - - while (nbytes) { - nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); - - err = skcipher_walk_done(&walk, nbytes); - nbytes = walk.nbytes; - } - - if (unlikely(cts)) { - u8 *next_tweak, *final_tweak = req->iv; - struct scatterlist *src, *dst; - struct scatterlist s[2], d[2]; - le128 b[2]; - - dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(d, req->dst, req->cryptlen); - - if (decrypt) { - next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE); - gf128mul_x_ble(b, b); - } else { - next_tweak = req->iv; - } - - skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE, - next_tweak); - - err = skcipher_walk_virt(&walk, req, false) ?: - skcipher_walk_done(&walk, - __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); - if (err) - goto out; - - scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0); - memcpy(b + 1, b, tail - XTS_BLOCK_SIZE); - scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE, - tail - XTS_BLOCK_SIZE, 0); - scatterwalk_map_and_copy(b, dst, 0, tail, 1); - - skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE, - final_tweak); - - err = skcipher_walk_virt(&walk, req, false) ?: - skcipher_walk_done(&walk, - __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); - } - -out: - glue_fpu_end(fpu_enabled); - - return err; -} -EXPORT_SYMBOL_GPL(glue_xts_req_128bit); - -void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src, - le128 *iv, common_glue_func_t fn) -{ - le128 ivblk = *iv; - - /* generate next IV */ - gf128mul_x_ble(iv, &ivblk); - - /* CC <- T xor C */ - u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk); - - /* PP <- D(Key2,CC) */ - fn(ctx, dst, dst); - - /* P <- T xor PP */ - u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk); -} -EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 7d568012cc15..71fae5a09e56 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -251,7 +251,7 @@ $code.=<<___; mov %rax,8($ctx) mov %rax,16($ctx) - cmp \$0,$inp + test $inp,$inp je .Lno_key ___ $code.=<<___ if (!$kernel); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index c44aba290fbb..646da46e8d10 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -210,7 +210,7 @@ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) } poly1305_simd_emit(&dctx->h, dst, dctx->s); - *dctx = (struct poly1305_desc_ctx){}; + memzero_explicit(dctx, sizeof(*dctx)); } EXPORT_SYMBOL(poly1305_final_arch); diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index ba9e4c1e7f5c..b7ee24df7fba 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -18,10 +18,6 @@ .align 16 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .text @@ -715,67 +711,3 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx) FRAME_END ret; SYM_FUNC_END(serpent_cbc_dec_8way_avx) - -SYM_FUNC_START(serpent_ctr_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK1, RK2); - - call __serpent_enc_blk8_avx; - - store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_ctr_8way_avx) - -SYM_FUNC_START(serpent_xts_enc_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); - - call __serpent_enc_blk8_avx; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_enc_8way_avx) - -SYM_FUNC_START(serpent_xts_dec_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); - - call __serpent_dec_blk8_avx; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx.h b/arch/x86/crypto/serpent-avx.h new file mode 100644 index 000000000000..23f3361a0e72 --- /dev/null +++ b/arch/x86/crypto/serpent-avx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_SERPENT_AVX_H +#define ASM_X86_SERPENT_AVX_H + +#include <crypto/b128ops.h> +#include <crypto/serpent.h> +#include <linux/types.h> + +struct crypto_skcipher; + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, + const u8 *src); + +asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, + const u8 *src); + +#endif diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index c9648aeae705..9161b6e441f3 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -20,16 +20,6 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - -.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - .text #define CTX %rdi @@ -734,80 +724,3 @@ SYM_FUNC_START(serpent_cbc_dec_16way) FRAME_END ret; SYM_FUNC_END(serpent_cbc_dec_16way) - -SYM_FUNC_START(serpent_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - vzeroupper; - - load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - tp); - - call __serpent_enc_blk16; - - store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_ctr_16way) - -SYM_FUNC_START(serpent_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - vzeroupper; - - load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call __serpent_enc_blk16; - - store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_enc_16way) - -SYM_FUNC_START(serpent_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - vzeroupper; - - load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call __serpent_dec_blk16; - - store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_dec_16way) diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/crypto/serpent-sse2.h index 860ca248914b..860ca248914b 100644 --- a/arch/x86/include/asm/crypto/serpent-sse2.h +++ b/arch/x86/crypto/serpent-sse2.h diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index f973ace44ad3..ccf0b5fa4933 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -12,9 +12,9 @@ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/serpent-avx.h> + +#include "serpent-avx.h" +#include "ecb_cbc_helpers.h" #define SERPENT_AVX2_PARALLEL_BLOCKS 16 @@ -23,158 +23,44 @@ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = serpent_ecb_enc_16way } - }, { - .num_blocks = 8, - .fn_u = { .ecb = serpent_ecb_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ctr = serpent_ctr_16way } - }, { - .num_blocks = 8, - .fn_u = { .ctr = serpent_ctr_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ctr = __serpent_crypt_ctr } - } } -}; - -static const struct common_glue_ctx serpent_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = serpent_xts_enc_16way } - }, { - .num_blocks = 8, - .fn_u = { .xts = serpent_xts_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_enc } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = serpent_ecb_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .ecb = serpent_ecb_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .cbc = serpent_cbc_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .cbc = serpent_cbc_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = serpent_xts_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .xts = serpent_xts_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_enc_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_dec_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, true); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { @@ -205,35 +91,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-avx2", - .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(serpent)", - .base.cra_driver_name = "__xts-serpent-avx2", - .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = SERPENT_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, - .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 7806d1cbe854..6c248e1ea4ef 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -15,9 +15,9 @@ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/serpent-avx.h> + +#include "serpent-avx.h" +#include "ecb_cbc_helpers.h" /* 8-way parallel cipher functions */ asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, @@ -32,191 +32,41 @@ asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); -asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); - -asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); - -asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); - -void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); - -void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt); -} -EXPORT_SYMBOL_GPL(serpent_xts_enc); - -void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt); -} -EXPORT_SYMBOL_GPL(serpent_xts_dec); - static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} -EXPORT_SYMBOL_GPL(xts_serpent_setkey); - -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_ecb_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = serpent_ctr_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ctr = __serpent_crypt_ctr } - } } -}; - -static const struct common_glue_ctx serpent_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = serpent_xts_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_enc } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_ecb_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = serpent_cbc_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = serpent_xts_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_enc_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_dec_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, true); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { @@ -247,35 +97,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-avx", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(serpent)", - .base.cra_driver_name = "__xts-serpent-avx", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = SERPENT_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, - .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 4fed8d26b91a..d78f37e9b2cf 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -10,8 +10,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <linux/module.h> @@ -22,8 +20,9 @@ #include <crypto/b128ops.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <asm/crypto/serpent-sse2.h> -#include <asm/crypto/glue_helper.h> + +#include "serpent-sse2.h" +#include "ecb_cbc_helpers.h" static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -31,130 +30,46 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s) -{ - u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - unsigned int j; - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - ivs[j] = src[j]; - - serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); -} - -static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} - -static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s, - le128 *iv) +static void serpent_decrypt_cbc_xway(const void *ctx, u8 *dst, const u8 *src) { - be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - unsigned int i; - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - le128_to_be128(&ctrblks[i], iv); - le128_inc(iv); - } + u8 buf[SERPENT_PARALLEL_BLOCKS - 1][SERPENT_BLOCK_SIZE]; + const u8 *s = src; - serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); + if (dst == src) + s = memcpy(buf, src, sizeof(buf)); + serpent_dec_blk_xway(ctx, dst, src); + crypto_xor(dst + SERPENT_BLOCK_SIZE, s, sizeof(buf)); } -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_enc_blk_xway } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = serpent_crypt_ctr_xway } - }, { - .num_blocks = 1, - .fn_u = { .ctr = serpent_crypt_ctr } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_dec_blk_xway } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = serpent_decrypt_cbc_xway } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_enc_blk_xway); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_dec_blk_xway); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, - req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_decrypt_cbc_xway); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { @@ -185,21 +100,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-sse2", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 18200135603f..44340a1139e0 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -22,7 +22,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> #include <crypto/sha1_base.h> #include <asm/simd.h> diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index dd06249229e1..3a5f6be7dbba 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -35,7 +35,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <crypto/sha256_base.h> #include <linux/string.h> #include <asm/simd.h> diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 63470fd6ae32..684d58c8bc4f 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -278,7 +278,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # "blocks" is the message length in SHA512 blocks ######################################################################## SYM_FUNC_START(sha512_transform_avx) - cmp $0, msglen + test msglen, msglen je nowork # Allocate Stack Space diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 7946a1bee85b..50812af0b083 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -280,7 +280,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE ######################################################################## SYM_FUNC_START(sha512_transform_ssse3) - cmp $0, msglen + test msglen, msglen je nowork # Allocate Stack Space diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index b0b05c93409e..30e70f4fe2f7 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -34,7 +34,7 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <asm/simd.h> diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index a5151393bb2f..37e63b3c664e 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -19,11 +19,6 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - .text /* structure of crypto context */ @@ -379,78 +374,3 @@ SYM_FUNC_START(twofish_cbc_dec_8way) FRAME_END ret; SYM_FUNC_END(twofish_cbc_dec_8way) - -SYM_FUNC_START(twofish_ctr_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - pushq %r12; - - movq %rsi, %r11; - movq %rdx, %r12; - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RX0, RX1, RY0); - - call __twofish_enc_blk8; - - store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); - - popq %r12; - - FRAME_END - ret; -SYM_FUNC_END(twofish_ctr_8way) - -SYM_FUNC_START(twofish_xts_enc_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); - - call __twofish_enc_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); - - FRAME_END - ret; -SYM_FUNC_END(twofish_xts_enc_8way) - -SYM_FUNC_START(twofish_xts_dec_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, - RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); - - call __twofish_dec_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(twofish_xts_dec_8way) diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/crypto/twofish.h index 2c377a8042e1..12df400e6d53 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/crypto/twofish.h @@ -17,9 +17,5 @@ asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src); /* helpers from twofish_x86_64-3way module */ extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src); -extern void twofish_enc_blk_ctr(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -extern void twofish_enc_blk_ctr_3way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); #endif /* ASM_X86_TWOFISH_H */ diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2dbc8ce3730e..3eb3440b477a 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -15,9 +15,9 @@ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/twofish.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/twofish.h> + +#include "twofish.h" +#include "ecb_cbc_helpers.h" #define TWOFISH_PARALLEL_BLOCKS 8 @@ -26,13 +26,6 @@ asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -45,171 +38,38 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk); -} - -static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk); -} - -struct twofish_xts_ctx { - struct twofish_ctx tweak_ctx; - struct twofish_ctx crypt_ctx; -}; - -static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} - -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = twofish_ecb_enc_8way } - }, { - .num_blocks = 3, - .fn_u = { .ecb = twofish_enc_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_enc_blk } - } } -}; - -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ctr = twofish_ctr_8way } - }, { - .num_blocks = 3, - .fn_u = { .ctr = twofish_enc_blk_ctr_3way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = twofish_enc_blk_ctr } - } } -}; - -static const struct common_glue_ctx twofish_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = twofish_xts_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = twofish_xts_enc } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = twofish_ecb_dec_8way } - }, { - .num_blocks = 3, - .fn_u = { .ecb = twofish_dec_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .cbc = twofish_cbc_dec_8way } - }, { - .num_blocks = 3, - .fn_u = { .cbc = twofish_dec_blk_cbc_3way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = twofish_xts_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = twofish_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_enc, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_enc_8way); + ECB_BLOCK(3, twofish_enc_blk_3way); + ECB_BLOCK(1, twofish_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_dec, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_dec_8way); + ECB_BLOCK(3, twofish_dec_blk_3way); + ECB_BLOCK(1, twofish_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(twofish_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&twofish_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_cbc_dec_8way); + CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way); + CBC_DEC_BLOCK(1, twofish_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg twofish_algs[] = { @@ -240,35 +100,6 @@ static struct skcipher_alg twofish_algs[] = { .setkey = twofish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(twofish)", - .base.cra_driver_name = "__ctr-twofish-avx", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct twofish_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .chunksize = TF_BLOCK_SIZE, - .setkey = twofish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(twofish)", - .base.cra_driver_name = "__xts-twofish-avx", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = TF_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct twofish_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * TF_MIN_KEY_SIZE, - .max_keysize = 2 * TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 768af6075479..03725696397c 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -5,17 +5,16 @@ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/twofish.h> #include <crypto/algapi.h> -#include <crypto/b128ops.h> -#include <crypto/internal/skcipher.h> #include <crypto/twofish.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> +#include "twofish.h" +#include "ecb_cbc_helpers.h" + EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); EXPORT_SYMBOL_GPL(twofish_dec_blk_3way); @@ -30,143 +29,48 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, true); -} - -void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s) +void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src) { - u128 ivs[2]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - ivs[0] = src[0]; - ivs[1] = src[1]; + u8 buf[2][TF_BLOCK_SIZE]; + const u8 *s = src; - twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + if (dst == src) + s = memcpy(buf, src, sizeof(buf)); + twofish_dec_blk_3way(ctx, dst, src); + crypto_xor(dst + TF_BLOCK_SIZE, s, sizeof(buf)); - u128_xor(&dst[1], &dst[1], &ivs[0]); - u128_xor(&dst[2], &dst[2], &ivs[1]); } EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) - *dst = *src; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, dst, (u128 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); - -void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblks[3]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - } - - le128_to_be128(&ctrblks[0], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[1], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[2], iv); - le128_inc(iv); - - twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); -} -EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way); - -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ecb = twofish_enc_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_enc_blk } - } } -}; - -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ctr = twofish_enc_blk_ctr_3way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = twofish_enc_blk_ctr } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ecb = twofish_dec_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .cbc = twofish_dec_blk_cbc_3way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = twofish_dec_blk } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_enc, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, -1); + ECB_BLOCK(3, twofish_enc_blk_3way); + ECB_BLOCK(1, twofish_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_dec, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, -1); + ECB_BLOCK(3, twofish_dec_blk_3way); + ECB_BLOCK(1, twofish_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(twofish_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&twofish_ctr, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way); + CBC_DEC_BLOCK(1, twofish_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg tf_skciphers[] = { @@ -195,20 +99,6 @@ static struct skcipher_alg tf_skciphers[] = { .setkey = twofish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(twofish)", - .base.cra_driver_name = "ctr-twofish-3way", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct twofish_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .chunksize = TF_BLOCK_SIZE, - .setkey = twofish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..4efd39aacb9f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -73,10 +73,8 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, unsigned int nr) { if (likely(nr < IA32_NR_syscalls)) { - instrumentation_begin(); nr = array_index_nospec(nr, IA32_NR_syscalls); regs->ax = ia32_sys_call_table[nr](regs); - instrumentation_end(); } } @@ -91,8 +89,11 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) * or may not be necessary, but it matches the old asm behavior. */ nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + instrumentation_begin(); do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); syscall_exit_to_user_mode(regs); } @@ -121,12 +122,14 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) res = get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp); } - instrumentation_end(); if (res) { /* User code screwed up. */ regs->ax = -EFAULT; - syscall_exit_to_user_mode(regs); + + instrumentation_end(); + local_irq_disable(); + irqentry_exit_to_user_mode(regs); return false; } @@ -135,6 +138,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); syscall_exit_to_user_mode(regs); return true; } @@ -209,40 +214,6 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -noinstr bool idtentry_enter_nmi(struct pt_regs *regs) -{ - bool irq_state = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - rcu_nmi_enter(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - instrumentation_end(); - - return irq_state; -} - -noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) -{ - instrumentation_begin(); - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - } - instrumentation_end(); - - rcu_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* @@ -279,30 +250,23 @@ static __always_inline bool get_and_clear_inhcall(void) { return false; } static __always_inline void restore_inhcall(bool inhcall) { } #endif -static void __xen_pv_evtchn_do_upcall(void) +static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) { - irq_enter_rcu(); + struct pt_regs *old_regs = set_irq_regs(regs); + inc_irq_stat(irq_hv_callback_count); xen_hvm_evtchn_do_upcall(); - irq_exit_rcu(); + set_irq_regs(old_regs); } __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) { - struct pt_regs *old_regs; + irqentry_state_t state = irqentry_enter(regs); bool inhcall; - irqentry_state_t state; - - state = irqentry_enter(regs); - old_regs = set_irq_regs(regs); - instrumentation_begin(); - run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); - instrumentation_begin(); - - set_irq_regs(old_regs); + run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); inhcall = get_and_clear_inhcall(); if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cad08703c4ad..400908dff42e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -46,14 +46,6 @@ .code64 .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT_XXL -SYM_CODE_START(native_usergs_sysret64) - UNWIND_HINT_EMPTY - swapgs - sysretq -SYM_CODE_END(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT_XXL */ - /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. If we're not, * go to the slow exit path. + * In the Xen PV case we must use iret anyway. */ + + ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ + X86_FEATURE_XENPV + movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 @@ -215,7 +212,8 @@ syscall_return_via_sysret: popq %rdi popq %rsp - USERGS_SYSRET64 + swapgs + sysretq SYM_CODE_END(entry_SYSCALL_64) /* @@ -669,7 +667,7 @@ native_irq_return_ldt: */ pushq %rdi /* Stash user RDI */ - SWAPGS /* to kernel GS */ + swapgs /* to kernel GS */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ movq PER_CPU_VAR(espfix_waddr), %rdi @@ -699,7 +697,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - SWAPGS /* to user GS */ + swapgs /* to user GS */ popq %rdi /* Restore user RDI */ movq %rax, %rsp @@ -756,47 +754,6 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) SYM_CODE_END(.Lbad_gs) .previous -/* - * rdi: New stack pointer points to the top word of the stack - * rsi: Function pointer - * rdx: Function argument (can be NULL if none) - */ -SYM_FUNC_START(asm_call_on_stack) -SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL) -SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL) - /* - * Save the frame pointer unconditionally. This allows the ORC - * unwinder to handle the stack switch. - */ - pushq %rbp - mov %rsp, %rbp - - /* - * The unwinder relies on the word at the top of the new stack - * page linking back to the previous RSP. - */ - mov %rsp, (%rdi) - mov %rdi, %rsp - /* Move the argument to the right place */ - mov %rdx, %rdi - -1: - .pushsection .discard.instr_begin - .long 1b - . - .popsection - - CALL_NOSPEC rsi - -2: - .pushsection .discard.instr_end - .long 2b - . - .popsection - - /* Restore the previous stack pointer from RBP. */ - leaveq - ret -SYM_FUNC_END(asm_call_on_stack) - #ifdef CONFIG_XEN_PV /* * A note on the "critical region" in our callback handler. @@ -943,7 +900,7 @@ SYM_CODE_START_LOCAL(paranoid_entry) ret .Lparanoid_entry_swapgs: - SWAPGS + swapgs /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an @@ -1001,7 +958,7 @@ SYM_CODE_START_LOCAL(paranoid_exit) jnz restore_regs_and_return_to_kernel /* We are returning to a context with user GSBASE */ - SWAPGS_UNSAFE_STACK + swapgs jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) @@ -1426,7 +1383,7 @@ nmi_no_fsgsbase: jnz nmi_restore nmi_swapgs: - SWAPGS_UNSAFE_STACK + swapgs nmi_restore: POP_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 541fdaf64045..0051cf5c792d 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index 6fb9b57ed5ba..d8c4f6c9eadc 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -6,8 +6,8 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') -syscall32 := $(srctree)/$(src)/syscall_32.tbl -syscall64 := $(srctree)/$(src)/syscall_64.tbl +syscall32 := $(src)/syscall_32.tbl +syscall64 := $(src)/syscall_64.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh @@ -21,37 +21,37 @@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ quiet_cmd_hypercalls = HYPERCALLS $@ - cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) + cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs)) syshdr_abi_unistd_32 := i386 -$(uapi)/unistd_32.h: $(syscall32) $(syshdr) +$(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_32_ia32 := i386 syshdr_pfx_unistd_32_ia32 := ia32_ -$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_x32 := common,x32 syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT -$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) +$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall64) $(syshdr) +$(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) syshdr_abi_unistd_64_x32 := x32 syshdr_pfx_unistd_64_x32 := x32_ -$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) +$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -$(out)/syscalls_32.h: $(syscall32) $(systbl) +$(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE $(call if_changed,systbl) -$(out)/syscalls_64.h: $(syscall64) $(systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE $(call if_changed,systbl) -$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh +$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE $(call if_changed,hypercalls) $(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h @@ -62,9 +62,10 @@ syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h syshdr-$(CONFIG_XEN) += xen-hypercalls.h -targets += $(uapisyshdr-y) $(syshdr-y) +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +syshdr-y := $(addprefix $(out)/, $(syshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(syshdr-y)) PHONY += all -all: $(addprefix $(uapi)/,$(uapisyshdr-y)) -all: $(addprefix $(out)/,$(syshdr-y)) +all: $(uapisyshdr-y) $(syshdr-y) @: diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0d0667a9fbd7..a1c9f496fca6 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -445,3 +445,5 @@ 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 440 i386 process_madvise sys_process_madvise +441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 i386 mount_setattr sys_mount_setattr diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..7bf01cbe582f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,6 +362,8 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index ccd32877a3c4..496b11ec469d 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -10,7 +10,7 @@ #include <asm/export.h> /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro THUNK name, func, put_ret_addr_in_rdi=0 + .macro THUNK name, func SYM_FUNC_START_NOALIGN(\name) pushq %rbp movq %rsp, %rbp @@ -25,13 +25,8 @@ SYM_FUNC_START_NOALIGN(\name) pushq %r10 pushq %r11 - .if \put_ret_addr_in_rdi - /* 8(%rbp) is return addr on stack */ - movq 8(%rbp), %rdi - .endif - call \func - jmp .L_restore + jmp __thunk_restore SYM_FUNC_END(\name) _ASM_NOKPROBE(\name) .endm @@ -44,7 +39,7 @@ SYM_FUNC_END(\name) #endif #ifdef CONFIG_PREEMPTION -SYM_CODE_START_LOCAL_NOALIGN(.L_restore) +SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) popq %r11 popq %r10 popq %r9 @@ -56,6 +51,6 @@ SYM_CODE_START_LOCAL_NOALIGN(.L_restore) popq %rdi popq %rbp ret - _ASM_NOKPROBE(.L_restore) -SYM_CODE_END(.L_restore) + _ASM_NOKPROBE(__thunk_restore) +SYM_CODE_END(__thunk_restore) #endif diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 21243747965d..05c4abc2fdfd 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -27,9 +27,10 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o +vobjs-$(CONFIG_X86_SGX) += vsgx.o # files to link into kernel -obj-y += vma.o +obj-y += vma.o extable.o KASAN_SANITIZE_vma.o := y UBSAN_SANITIZE_vma.o := y KCSAN_SANITIZE_vma.o := y @@ -90,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -98,6 +99,7 @@ $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS CFLAGS_REMOVE_vclock_gettime.o = -pg CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vsgx.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. @@ -128,8 +130,8 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE +$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table +$(obj)/%.so: $(obj)/%.so.dbg $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE @@ -148,6 +150,7 @@ KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c new file mode 100644 index 000000000000..afcf5b65beef --- /dev/null +++ b/arch/x86/entry/vdso/extable.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/err.h> +#include <linux/mm.h> +#include <asm/current.h> +#include <asm/traps.h> +#include <asm/vdso.h> + +struct vdso_exception_table_entry { + int insn, fixup; +}; + +bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr) +{ + const struct vdso_image *image = current->mm->context.vdso_image; + const struct vdso_exception_table_entry *extable; + unsigned int nr_entries, i; + unsigned long base; + + /* + * Do not attempt to fixup #DB or #BP. It's impossible to identify + * whether or not a #DB/#BP originated from within an SGX enclave and + * SGX enclaves are currently the only use case for vDSO fixup. + */ + if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) + return false; + + if (!current->mm->context.vdso) + return false; + + base = (unsigned long)current->mm->context.vdso + image->extable_base; + nr_entries = image->extable_len / (sizeof(*extable)); + extable = image->extable; + + for (i = 0; i < nr_entries; i++) { + if (regs->ip == base + extable[i].insn) { + regs->ip = base + extable[i].fixup; + regs->di = trapnr; + regs->si = error_code; + regs->dx = fault_addr; + return true; + } + } + + return false; +} diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h new file mode 100644 index 000000000000..b56f6b012941 --- /dev/null +++ b/arch/x86/entry/vdso/extable.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_EXTABLE_H +#define __VDSO_EXTABLE_H + +/* + * Inject exception fixup for vDSO code. Unlike normal exception fixup, + * vDSO uses a dedicated handler the addresses are relative to the overall + * exception table, not each individual entry. + */ +#ifdef __ASSEMBLY__ +#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ + ASM_VDSO_EXTABLE_HANDLE from to + +.macro ASM_VDSO_EXTABLE_HANDLE from:req to:req + .pushsection __ex_table, "a" + .long (\from) - __ex_table + .long (\to) - __ex_table + .popsection +.endm +#else +#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ + ".pushsection __ex_table, \"a\"\n" \ + ".long (" #from ") - __ex_table\n" \ + ".long (" #to ") - __ex_table\n" \ + ".popsection\n" +#endif + +#endif /* __VDSO_EXTABLE_H */ diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 4d152933547d..dc8da7695859 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -75,11 +75,18 @@ SECTIONS * stuff that isn't used at runtime in between. */ - .text : { *(.text*) } :text =0x90909090, + .text : { + *(.text*) + *(.fixup) + } :text =0x90909090, + + .altinstructions : { *(.altinstructions) } :text .altinstr_replacement : { *(.altinstr_replacement) } :text + __ex_table : { *(__ex_table) } :text + /DISCARD/ : { *(.discard) *(.discard.*) diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 36b644e16272..4bf48462fca7 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -27,6 +27,7 @@ VERSION { __vdso_time; clock_getres; __vdso_clock_getres; + __vdso_sgx_enter_enclave; local: *; }; } diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 7380908045c7..2d0f3d8bcc25 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -101,6 +101,8 @@ struct vdso_sym required_syms[] = { {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, {"int80_landing_pad", true}, + {"vdso32_rt_sigreturn_landing_pad", true}, + {"vdso32_sigreturn_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 6f46e11ce539..1c7cfac7e64a 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -5,6 +5,41 @@ * are built for 32-bit userspace. */ +static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (i % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", (int)(data)[i]); + } +} + + +/* + * Extract a section from the input data into a standalone blob. Used to + * capture kernel-only data that needs to persist indefinitely, e.g. the + * exception fixup tables, but only in the kernel, i.e. the section can + * be stripped from the final vDSO image. + */ +static void BITSFUNC(extract)(const unsigned char *data, size_t data_len, + FILE *outfile, ELF(Shdr) *sec, const char *name) +{ + unsigned long offset; + size_t len; + + offset = (unsigned long)GET_LE(&sec->sh_offset); + len = (size_t)GET_LE(&sec->sh_size); + + if (offset + len > data_len) + fail("section to extract overruns input data"); + + fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len); + BITSFUNC(copy)(outfile, data + offset, len); + fprintf(outfile, "\n};\n\n"); +} + static void BITSFUNC(go)(void *raw_addr, size_t raw_len, void *stripped_addr, size_t stripped_len, FILE *outfile, const char *image_name) @@ -15,7 +50,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; unsigned long i, syms_nr; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, - *alt_sec = NULL; + *alt_sec = NULL, *extable_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; const char *secstrings; INT_BITS syms[NSYMS] = {}; @@ -77,6 +112,8 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, if (!strcmp(secstrings + GET_LE(&sh->sh_name), ".altinstructions")) alt_sec = sh; + if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table")) + extable_sec = sh; } if (!symtab_hdr) @@ -155,6 +192,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, (int)((unsigned char *)stripped_addr)[i]); } fprintf(outfile, "\n};\n\n"); + if (extable_sec) + BITSFUNC(extract)(raw_addr, raw_len, outfile, + extable_sec, "extable"); fprintf(outfile, "const struct vdso_image %s = {\n", image_name); fprintf(outfile, "\t.data = raw_data,\n"); @@ -165,6 +205,14 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "\t.alt_len = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_size)); } + if (extable_sec) { + fprintf(outfile, "\t.extable_base = %lu,\n", + (unsigned long)GET_LE(&extable_sec->sh_offset)); + fprintf(outfile, "\t.extable_len = %lu,\n", + (unsigned long)GET_LE(&extable_sec->sh_size)); + fprintf(outfile, "\t.extable = extable,\n"); + } + for (i = 0; i < NSYMS; i++) { if (required_syms[i].export && syms[i]) fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index c3233ee98a6b..1bd068f72d4c 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -18,6 +18,7 @@ __kernel_sigreturn: movl $__NR_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_sigreturn: +SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_sigreturn,.-.LSTART_sigreturn @@ -29,6 +30,7 @@ __kernel_rt_sigreturn: movl $__NR_rt_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_rt_sigreturn: +SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn .previous diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 9185cb1d13b9..825e829ffff1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -89,30 +89,14 @@ static void vdso_fix_landing(const struct vdso_image *image, static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; const struct vdso_image *image = current->mm->context.vdso_image; - if (image->size != new_size) - return -EINVAL; - vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; return 0; } -static int vvar_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - const struct vdso_image *image = new_vma->vm_mm->context.vdso_image; - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - - if (new_size != -image->sym_vvar_start) - return -EINVAL; - - return 0; -} - #ifdef CONFIG_TIME_NS static struct page *find_timens_vvar_page(struct vm_area_struct *vma) { @@ -252,7 +236,6 @@ static const struct vm_special_mapping vdso_mapping = { static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, - .mremap = vvar_mremap, }; /* @@ -413,10 +396,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) + int uses_interp, bool x32) { #ifdef CONFIG_X86_X32_ABI - if (test_thread_flag(TIF_X32)) { + if (x32) { if (!vdso64_enabled) return 0; return map_vdso_randomized(&vdso_image_x32); @@ -436,6 +419,21 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } #endif +bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + const struct vdso_image *image = current->mm->context.vdso_image; + unsigned long vdso = (unsigned long) current->mm->context.vdso; + + if (in_ia32_syscall() && image == &vdso_image_32) { + if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad || + regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) + return true; + } +#endif + return false; +} + #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) { diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S new file mode 100644 index 000000000000..86a0e94f68df --- /dev/null +++ b/arch/x86/entry/vdso/vsgx.S @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/export.h> +#include <asm/errno.h> +#include <asm/enclu.h> + +#include "extable.h" + +/* Relative to %rbp. */ +#define SGX_ENCLAVE_OFFSET_OF_RUN 16 + +/* The offsets relative to struct sgx_enclave_run. */ +#define SGX_ENCLAVE_RUN_TCS 0 +#define SGX_ENCLAVE_RUN_LEAF 8 +#define SGX_ENCLAVE_RUN_EXCEPTION_VECTOR 12 +#define SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE 14 +#define SGX_ENCLAVE_RUN_EXCEPTION_ADDR 16 +#define SGX_ENCLAVE_RUN_USER_HANDLER 24 +#define SGX_ENCLAVE_RUN_USER_DATA 32 /* not used */ +#define SGX_ENCLAVE_RUN_RESERVED_START 40 +#define SGX_ENCLAVE_RUN_RESERVED_END 256 + +.code64 +.section .text, "ax" + +SYM_FUNC_START(__vdso_sgx_enter_enclave) + /* Prolog */ + .cfi_startproc + push %rbp + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rbp, 0 + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + push %rbx + .cfi_rel_offset %rbx, -8 + + mov %ecx, %eax +.Lenter_enclave: + /* EENTER <= function <= ERESUME */ + cmp $EENTER, %eax + jb .Linvalid_input + cmp $ERESUME, %eax + ja .Linvalid_input + + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rcx + + /* Validate that the reserved area contains only zeros. */ + mov $SGX_ENCLAVE_RUN_RESERVED_START, %rbx +1: + cmpq $0, (%rcx, %rbx) + jne .Linvalid_input + add $8, %rbx + cmpq $SGX_ENCLAVE_RUN_RESERVED_END, %rbx + jne 1b + + /* Load TCS and AEP */ + mov SGX_ENCLAVE_RUN_TCS(%rcx), %rbx + lea .Lasync_exit_pointer(%rip), %rcx + + /* Single ENCLU serving as both EENTER and AEP (ERESUME) */ +.Lasync_exit_pointer: +.Lenclu_eenter_eresume: + enclu + + /* EEXIT jumps here unless the enclave is doing something fancy. */ + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx + + /* Set exit_reason. */ + movl $EEXIT, SGX_ENCLAVE_RUN_LEAF(%rbx) + + /* Invoke userspace's exit handler if one was provided. */ +.Lhandle_exit: + cmpq $0, SGX_ENCLAVE_RUN_USER_HANDLER(%rbx) + jne .Linvoke_userspace_handler + + /* Success, in the sense that ENCLU was attempted. */ + xor %eax, %eax + +.Lout: + pop %rbx + leave + .cfi_def_cfa %rsp, 8 + ret + + /* The out-of-line code runs with the pre-leave stack frame. */ + .cfi_def_cfa %rbp, 16 + +.Linvalid_input: + mov $(-EINVAL), %eax + jmp .Lout + +.Lhandle_exception: + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx + + /* Set the exception info. */ + mov %eax, (SGX_ENCLAVE_RUN_LEAF)(%rbx) + mov %di, (SGX_ENCLAVE_RUN_EXCEPTION_VECTOR)(%rbx) + mov %si, (SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE)(%rbx) + mov %rdx, (SGX_ENCLAVE_RUN_EXCEPTION_ADDR)(%rbx) + jmp .Lhandle_exit + +.Linvoke_userspace_handler: + /* Pass the untrusted RSP (at exit) to the callback via %rcx. */ + mov %rsp, %rcx + + /* Save struct sgx_enclave_exception %rbx is about to be clobbered. */ + mov %rbx, %rax + + /* Save the untrusted RSP offset in %rbx (non-volatile register). */ + mov %rsp, %rbx + and $0xf, %rbx + + /* + * Align stack per x86_64 ABI. Note, %rsp needs to be 16-byte aligned + * _after_ pushing the parameters on the stack, hence the bonus push. + */ + and $-0x10, %rsp + push %rax + + /* Push struct sgx_enclave_exception as a param to the callback. */ + push %rax + + /* Clear RFLAGS.DF per x86_64 ABI */ + cld + + /* + * Load the callback pointer to %rax and lfence for LVI (load value + * injection) protection before making the call. + */ + mov SGX_ENCLAVE_RUN_USER_HANDLER(%rax), %rax + lfence + call *%rax + + /* Undo the post-exit %rsp adjustment. */ + lea 0x10(%rsp, %rbx), %rsp + + /* + * If the return from callback is zero or negative, return immediately, + * else re-execute ENCLU with the postive return value interpreted as + * the requested ENCLU function. + */ + cmp $0, %eax + jle .Lout + jmp .Lenter_enclave + + .cfi_endproc + +_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception) + +SYM_FUNC_END(__vdso_sgx_enter_enclave) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 44c33103a955..1b40b9297083 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -316,7 +316,7 @@ static struct vm_area_struct gate_vma __ro_after_init = { struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT - if (!mm || mm->context.ia32_compat) + if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL)) return NULL; #endif if (vsyscall_mode == NONE) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 39eb276d0277..2c1791c4a518 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -538,7 +538,7 @@ static void amd_pmu_cpu_starting(int cpu) if (!x86_pmu.amd_nb_constraints) return; - nb_id = amd_get_nb_id(cpu); + nb_id = topology_die_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); for_each_online_cpu(i) { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a88c94d65693..18df17129695 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -81,6 +81,12 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); +/* + * This one is magic, it will get called even when PMU init fails (because + * there is no PMU), in which case it should simply return NULL. + */ +DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); + u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -253,6 +259,8 @@ static bool check_hw_exists(void) if (ret) goto msr_fail; for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (fixed_counter_disabled(i)) + continue; if (val & (0x03 << i*4)) { bios_fail = 1; val_fail = val; @@ -665,6 +673,12 @@ void x86_pmu_disable_all(void) } } +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + return static_call(x86_pmu_guest_get_msrs)(nr); +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. @@ -1174,7 +1188,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; - /* fall through */ + fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + @@ -1523,6 +1537,8 @@ void perf_event_print_debug(void) cpu, idx, prev_left); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1923,6 +1939,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); + + static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); } static void _x86_pmu_read(struct perf_event *event) @@ -1995,12 +2013,17 @@ static int __init init_hw_perf_events(void) pr_info("... generic registers: %d\n", x86_pmu.num_counters); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; + if (!x86_pmu.guest_get_msrs) + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; + x86_pmu_static_call_update(); /* @@ -2602,7 +2625,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; - if (!test_thread_flag(TIF_IA32)) + if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index af457f8cb29d..37ce38403cb8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -257,7 +257,8 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ - INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */ INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), @@ -274,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_spr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_spr_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_EVENT_CONSTRAINT(0x2e, 0xff), + INTEL_EVENT_CONSTRAINT(0x3c, 0xff), + /* + * Generally event codes < 0x90 are restricted to counters 0-3. + * The 0x2E and 0x3C are exception, which has no restriction. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), + /* + * Generally event codes >= 0x90 are likely to have no restrictions. + * The exception are defined as above. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff), + + EVENT_CONSTRAINT_END +}; + + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -313,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); -EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); -EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); -EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); -EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); -EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84"); +EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85"); +EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86"); +EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87"); static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), @@ -383,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 spr_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe124, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_MISS) ] = 0xe424, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe12, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + [ C(RESULT_MISS) ] = 0xe13, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = 0xe11, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4c4, + [ C(RESULT_MISS) ] = 0x4c5, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, +}; + +static __initconst const u64 spr_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10001, + [ C(RESULT_MISS) ] = 0x3fbfc00001, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x3f3ffc0002, + [ C(RESULT_MISS) ] = 0x3f3fc00002, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10c000001, + [ C(RESULT_MISS) ] = 0x3fb3000001, + }, + }, +}; + /* * Notes on the events: * - data reads do not include code reads (comparable to earlier tables) @@ -1900,6 +2056,19 @@ static __initconst const u64 tnt_hw_cache_extra_regs }, }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_tnt, "event=0x74,umask=0x0"); + +static struct attribute *tnt_events_attrs[] = { + EVENT_PTR(td_fe_bound_tnt), + EVENT_PTR(td_retiring_tnt), + EVENT_PTR(td_bad_spec_tnt), + EVENT_PTR(td_be_bound_tnt), + NULL, +}; + static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0), @@ -2120,18 +2289,6 @@ static void intel_tfa_pmu_enable_all(int added) intel_pmu_enable_all(added); } -static void enable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() | - DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - -static void disable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() & - ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2323,8 +2480,8 @@ static void __icl_update_topdown_event(struct perf_event *event, } } -static void update_saved_topdown_regs(struct perf_event *event, - u64 slots, u64 metrics) +static void update_saved_topdown_regs(struct perf_event *event, u64 slots, + u64 metrics, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2333,7 +2490,7 @@ static void update_saved_topdown_regs(struct perf_event *event, event->hw.saved_slots = slots; event->hw.saved_metric = metrics; - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2348,7 +2505,8 @@ static void update_saved_topdown_regs(struct perf_event *event, * The PERF_METRICS and Fixed counter 3 are read separately. The values may be * modify by a NMI. PMU has to be disabled before calling this function. */ -static u64 icl_update_topdown_event(struct perf_event *event) + +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2364,7 +2522,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) /* read PERF_METRICS */ rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2390,7 +2548,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) * Don't need to reset the PERF_METRICS and Fixed counter 3. * Because the values will be restored in next schedule in. */ - update_saved_topdown_regs(event, slots, metrics); + update_saved_topdown_regs(event, slots, metrics, metric_end); reset = false; } @@ -2399,12 +2557,18 @@ static u64 icl_update_topdown_event(struct perf_event *event) wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); wrmsrl(MSR_PERF_METRICS, 0); if (event) - update_saved_topdown_regs(event, 0, 0); + update_saved_topdown_regs(event, 0, 0, metric_end); } return slots; } +static u64 icl_update_topdown_event(struct perf_event *event) +{ + return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + + x86_pmu.num_topdown_events - 1); +} + static void intel_pmu_read_topdown_event(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2559,8 +2723,11 @@ static void intel_pmu_reset(void) wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } if (ds) ds->bts_index = ds->bts_buffer_base; @@ -2695,95 +2862,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing = true; -static int __init intel_perf_counter_freezing_setup(char *s) -{ - bool res; - - if (kstrtobool(s, &res)) - return -EINVAL; - - disable_counter_freezing = !res; - return 1; -} -__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); - -/* - * Simplified handler for Arch Perfmon v4: - * - We rely on counter freezing/unfreezing to enable/disable the PMU. - * This is done automatically on PMU ack. - * - Ack the PMU only after the APIC. - */ - -static int intel_pmu_handle_irq_v4(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int handled = 0; - bool bts = false; - u64 status; - int pmu_enabled = cpuc->enabled; - int loops = 0; - - /* PMU has been disabled because of counter freezing */ - cpuc->enabled = 0; - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - bts = true; - intel_bts_disable_local(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - } - status = intel_pmu_get_status(); - if (!status) - goto done; -again: - intel_pmu_lbr_read(); - if (++loops > 100) { - static bool warned; - - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - - handled += handle_pmi_common(regs, status); -done: - /* Ack the PMI in the APIC */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* - * The counters start counting immediately while ack the status. - * Make it as close as possible to IRET. This avoids bogus - * freezing on Skylake CPUs. - */ - if (status) { - intel_pmu_ack_status(status); - } else { - /* - * CPU may issues two PMIs very close to each other. - * When the PMI handler services the first one, the - * GLOBAL_STATUS is already updated to reflect both. - * When it IRETs, the second PMI is immediately - * handled and it sees clear status. At the meantime, - * there may be a third PMI, because the freezing bit - * isn't set since the ack in first PMI handlers. - * Double check if there is more work to be done. - */ - status = intel_pmu_get_status(); - if (status) - goto again; - } - - if (bts) - intel_bts_enable_local(); - cpuc->enabled = pmu_enabled; - return handled; -} - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -3549,6 +3627,26 @@ static int core_pmu_hw_config(struct perf_event *event) return intel_pmu_bts_config(event); } +#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \ + ((x86_pmu.num_topdown_events - 1) << 8)) + +static bool is_available_metric_event(struct perf_event *event) +{ + return is_metric_event(event) && + event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; +} + +static inline bool is_mem_loads_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01); +} + +static inline bool is_mem_loads_aux_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); +} + + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3561,11 +3659,16 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) + return -EINVAL; + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3578,6 +3681,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so don't account twice @@ -3622,7 +3726,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.config & X86_ALL_EVENT_FLAGS) return -EINVAL; - if (is_metric_event(event)) { + if (is_available_metric_event(event)) { struct perf_event *leader = event->group_leader; /* The metric events don't support sampling. */ @@ -3651,6 +3755,33 @@ static int intel_pmu_hw_config(struct perf_event *event) } } + /* + * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR + * doesn't function quite right. As a work-around it needs to always be + * co-scheduled with a auxiliary event X86_CONFIG(.event=0x03, .umask=0x82). + * The actual count of this second event is irrelevant it just needs + * to be active to make the first event function correctly. + * + * In a group, the auxiliary event must be in front of the load latency + * event. The rule is to simplify the implementation of the check. + * That's because perf cannot have a complete group at the moment. + */ + if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX && + (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) && + is_mem_loads_event(event)) { + struct perf_event *leader = event->group_leader; + struct perf_event *sibling = NULL; + + if (!is_mem_loads_aux_event(leader)) { + for_each_sibling_event(sibling, leader) { + if (is_mem_loads_aux_event(sibling)) + break; + } + if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list)) + return -ENODATA; + } + } + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0; @@ -3666,26 +3797,6 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } -#ifdef CONFIG_RETPOLINE -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr); -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr); -#endif - -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ -#ifdef CONFIG_RETPOLINE - if (x86_pmu.guest_get_msrs == intel_guest_get_msrs) - return intel_guest_get_msrs(nr); - else if (x86_pmu.guest_get_msrs == core_guest_get_msrs) - return core_guest_get_msrs(nr); -#endif - if (x86_pmu.guest_get_msrs) - return x86_pmu.guest_get_msrs(nr); - *nr = 0; - return NULL; -} -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); - static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -3851,6 +3962,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } static struct event_constraint * +spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = icl_get_event_constraints(cpuc, idx, event); + + /* + * The :ppp indicates the Precise Distribution (PDist) facility, which + * is only supported on the GP counter 0. If a :ppp event which is not + * available on the GP counter 0, error out. + */ + if (event->attr.precise_ip == 3) { + if (c->idxmsk64 & BIT_ULL(0)) + return &counter0_constraint; + + return &emptyconstraint; + } + + return c; +} + +static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { @@ -3939,6 +4073,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left) return max(left, 32ULL); } +static u64 spr_limit_period(struct perf_event *event, u64 left) +{ + if (event->attr.precise_ip == 3) + return max(left, 128ULL); + + return left; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -4080,9 +4222,6 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); - if (x86_pmu.counter_freezing) - enable_counter_freeze(); - /* Disable perf metrics if any added CPU doesn't support it. */ if (x86_pmu.intel_cap.perf_metrics) { union perf_capabilities perf_cap; @@ -4153,9 +4292,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); - - if (x86_pmu.counter_freezing) - disable_counter_freeze(); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -4383,6 +4519,9 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e), @@ -4547,39 +4686,6 @@ static __init void intel_nehalem_quirk(void) } } -static const struct x86_cpu_desc counter_freezing_ucodes[] = { - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), - {} -}; - -static bool intel_counter_freezing_broken(void) -{ - return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes); -} - -static __init void intel_counter_freezing_quirk(void) -{ - /* Check if it's already disabled */ - if (disable_counter_freezing) - return; - - /* - * If the system starts with the wrong ucode, leave the - * counter-freezing feature permanently disabled. - */ - if (intel_counter_freezing_broken()) { - pr_info("PMU counter freezing disabled due to CPU errata," - "please upgrade microcode\n"); - x86_pmu.counter_freezing = false; - x86_pmu.handle_irq = intel_pmu_handle_irq; - } -} - /* * enable software workaround for errata: * SNB: BJ122 @@ -4689,6 +4795,42 @@ static struct attribute *icl_tsx_events_attrs[] = { NULL, }; + +EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82"); + +static struct attribute *spr_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_spr), + EVENT_PTR(mem_ld_aux), + NULL, +}; + +static struct attribute *spr_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + EVENT_PTR(td_heavy_ops), + EVENT_PTR(td_br_mispredict), + EVENT_PTR(td_fetch_lat), + EVENT_PTR(td_mem_bound), + NULL, +}; + +static struct attribute *spr_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4912,7 +5054,7 @@ __init int intel_pmu_init(void) union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; - unsigned int unused; + unsigned int fixed_mask; struct extra_reg *er; bool pmem = false; int version, i; @@ -4934,7 +5076,7 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; @@ -4958,15 +5100,15 @@ __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) { + if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, assume); - } - if (version >= 4) - x86_pmu.counter_freezing = !disable_counter_freezing; + fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + } else if (version >= 5) + x86_pmu.num_counters_fixed = fls(fixed_mask); if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -5095,7 +5237,6 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_D: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -5122,7 +5263,6 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -5173,6 +5313,7 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.get_event_constraints = tnt_get_event_constraints; + td_attr = tnt_events_attrs; extra_attr = slm_format_attr; pr_cont("Tremont events, "); name = "Tremont"; @@ -5442,6 +5583,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -5464,15 +5606,53 @@ __init int intel_pmu_init(void) mem_attr = icl_events_attrs; td_attr = icl_td_events_attrs; tsx_attr = icl_tsx_events_attrs; - x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; pr_cont("Icelake events, "); name = "icelake"; break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + pmem = true; + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + x86_pmu.event_constraints = intel_spr_event_constraints; + x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; + x86_pmu.extra_regs = intel_spr_extra_regs; + x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = spr_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_skl_attr = skl_format_attr; + mem_attr = spr_events_attrs; + td_attr = spr_td_events_attrs; + tsx_attr = spr_tsx_events_attrs; + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 8; + x86_pmu.update_topdown_event = icl_update_topdown_event; + x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + pr_cont("Sapphire Rapids events, "); + name = "sapphire_rapids"; + break; + default: switch (x86_pmu.version) { case 1: @@ -5515,8 +5695,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; } - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED; /* AnyThread may be deprecated on arch perfmon v5 or later */ if (x86_pmu.intel_cap.anythread_deprecated) @@ -5533,13 +5712,22 @@ __init int intel_pmu_init(void) * events to the generic counters. */ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* + * Disable topdown slots and metrics events, + * if slots event is not in CPUID. + */ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl)) + c->idxmsk64 = 0; c->weight = hweight64(c->idxmsk64); continue; } - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= x86_pmu.intel_ctrl; + + if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); @@ -5585,13 +5773,6 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } - /* - * For arch perfmon 4 use counter freezing to avoid - * several MSR accesses in the PMI. - */ - if (x86_pmu.counter_freezing) - x86_pmu.handle_irq = intel_pmu_handle_irq_v4; - if (x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4eb7ee5fed72..407eee5f6f95 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -51,46 +51,46 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * TNT,RKL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL + * ICL,TGL,RKL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL,TNT + * KBL,CML,ICL,TGL,TNT,RKL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL,TNT + * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * TNT,RKL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, - * KBL,CML,ICL,TGL + * KBL,CML,ICL,TGL,RKL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT + * TNT,RKL * Scope: Package (physical package) * */ @@ -649,6 +649,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 485c5066f8b8..d32b302719fe 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -36,7 +36,9 @@ union intel_x86_pebs_dse { unsigned int ld_dse:4; unsigned int ld_stlb_miss:1; unsigned int ld_locked:1; - unsigned int ld_reserved:26; + unsigned int ld_data_blk:1; + unsigned int ld_addr_blk:1; + unsigned int ld_reserved:24; }; struct { unsigned int st_l1d_hit:1; @@ -45,6 +47,12 @@ union intel_x86_pebs_dse { unsigned int st_locked:1; unsigned int st_reserved2:26; }; + struct { + unsigned int st_lat_dse:4; + unsigned int st_lat_stlb_miss:1; + unsigned int st_lat_locked:1; + unsigned int ld_reserved3:26; + }; }; @@ -198,6 +206,63 @@ static u64 load_latency_data(u64 status) if (dse.ld_locked) val |= P(LOCK, LOCKED); + /* + * Ice Lake and earlier models do not support block infos. + */ + if (!x86_pmu.pebs_block) { + val |= P(BLK, NA); + return val; + } + /* + * bit 6: load was blocked since its data could not be forwarded + * from a preceding store + */ + if (dse.ld_data_blk) + val |= P(BLK, DATA); + + /* + * bit 7: load was blocked due to potential address conflict with + * a preceding store + */ + if (dse.ld_addr_blk) + val |= P(BLK, ADDR); + + if (!dse.ld_data_blk && !dse.ld_addr_blk) + val |= P(BLK, NA); + + return val; +} + +static u64 store_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.st_lat_dse]; + + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.st_lat_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.st_lat_locked) + val |= P(LOCK, LOCKED); + + val |= P(BLK, NA); + return val; } @@ -870,6 +935,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_spr_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), + INTEL_PLD_CONSTRAINT(0x1cd, 0xfe), + INTEL_PSD_CONSTRAINT(0x2cd, 0x1), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. + */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -960,8 +1047,10 @@ static void adaptive_pebs_record_size_update(void) } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE | \ + PERF_SAMPLE_TRANSACTION | \ + PERF_SAMPLE_DATA_PAGE_SIZE) static u64 pebs_update_adaptive_cfg(struct perf_event *event) { @@ -986,7 +1075,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1261,7 +1350,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) old_to = to; #ifdef CONFIG_X86_64 - is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); + is_64bit = kernel_ip(to) || any_64bit_mode(regs); #endif insn_init(&insn, kaddr, size, is_64bit); insn_get_length(&insn); @@ -1330,6 +1419,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux) if (fl & PERF_X86_EVENT_PEBS_LDLAT) val = load_latency_data(aux); + else if (fl & PERF_X86_EVENT_PEBS_STLAT) + val = store_latency_data(aux); else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) val = precise_datala_hsw(event, aux); else if (fst) @@ -1337,6 +1428,10 @@ static u64 get_data_src(struct perf_event *event, u64 aux) return val; } +#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_DATA_PAGE_SIZE) + static void setup_pebs_fixed_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -1364,8 +1459,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + data->weight.full = pebs->lat; /* * data.data_src encodes the data source @@ -1451,14 +1546,14 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, } - if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && + if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_get_tsx_weight(pebs->tsx_tuning); + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, @@ -1502,6 +1597,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +#define PEBS_LATENCY_MASK 0xffff +#define PEBS_CACHE_LATENCY_OFFSET 32 + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -1572,14 +1670,32 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT) - data->weight = meminfo->latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 weight = meminfo->latency; + + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { + data->weight.var2_w = weight & PEBS_LATENCY_MASK; + weight >>= PEBS_CACHE_LATENCY_OFFSET; + } + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. + */ + if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = weight ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } else { + data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } + } if (sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = get_data_src(event, meminfo->aux); - if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; if (sample_type & PERF_SAMPLE_TRANSACTION) @@ -1894,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d */ if (!pebs_status && cpuc->pebs_enabled && !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; + pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8961653c5dd2..21890dacfcfe 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -919,7 +919,7 @@ static __always_inline bool get_lbr_predicted(u64 info) return !(info & LBR_INFO_MISPRED); } -static __always_inline bool get_lbr_cycles(u64 info) +static __always_inline u16 get_lbr_cycles(u64 info) { if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) @@ -1221,7 +1221,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) * on 64-bit systems running 32-bit apps */ #ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); #endif insn_init(&insn, addr, bytes_read, is64); insn_get_opcode(&insn); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 80d52cbe2fde..33c8180d5a87 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty = MODULE_LICENSE("GPL"); -int uncore_pcibus_to_physid(struct pci_bus *bus) +int uncore_pcibus_to_dieid(struct pci_bus *bus) { struct pci2phy_map *map; - int phys_id = -1; + int die_id = -1; raw_spin_lock(&pci2phy_map_lock); list_for_each_entry(map, &pci2phy_map_head, list) { if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; + die_id = map->pbus_to_dieid[bus->number]; break; } } raw_spin_unlock(&pci2phy_map_lock); - return phys_id; + return die_id; } static void uncore_free_pcibus_map(void) @@ -86,7 +86,7 @@ lookup: alloc = NULL; map->segment = segment; for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; + map->pbus_to_dieid[i] = -1; list_add_tail(&map->list, &pci2phy_map_head); end: @@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, uncore_pmu_init_hrtimer(box); box->cpu = -1; - box->pci_phys_id = -1; box->dieid = -1; /* set default hrtimer timeout */ @@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) /* * Get the die information of a PCI device. * @pdev: The PCI device. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ -static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, - int *phys_id, int *die) +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) { - *phys_id = uncore_pcibus_to_physid(pdev->bus); - if (*phys_id < 0) - return -ENODEV; - - *die = (topology_max_die_per_package() > 1) ? *phys_id : - topology_phys_to_logical_pkg(*phys_id); + *die = uncore_pcibus_to_dieid(pdev->bus); if (*die < 0) return -EINVAL; @@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) * @pdev: The PCI device. * @type: The corresponding PMU type of the device. * @pmu: The corresponding PMU of the device. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ static int uncore_pci_pmu_register(struct pci_dev *pdev, struct intel_uncore_type *type, struct intel_uncore_pmu *pmu, - int phys_id, int die) + int die) { struct intel_uncore_box *box; int ret; @@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev, WARN_ON_ONCE(pmu->func_id != pdev->devfn); atomic_inc(&box->refcnt); - box->pci_phys_id = phys_id; box->dieid = die; box->pci_dev = pdev; box->pmu = pmu; @@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu = NULL; - int phys_id, die, ret; + int die, ret; - ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); + ret = uncore_pci_get_dev_die_info(pdev, &die); if (ret) return ret; @@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } - ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die); + ret = uncore_pci_pmu_register(pdev, type, pmu, die); pci_set_drvdata(pdev, pmu->boxes[die]); @@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id /* * Unregister the PMU of a PCI device * @pmu: The corresponding PMU is unregistered. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ -static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, - int phys_id, int die) +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die) { struct intel_uncore_box *box = pmu->boxes[die]; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; - pmu->boxes[die] = NULL; if (atomic_dec_return(&pmu->activeboxes) == 0) uncore_pmu_unregister(pmu); @@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; - int i, phys_id, die; + int i, die; - if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pdev, &die)) return; box = pci_get_drvdata(pdev); @@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - uncore_pci_pmu_unregister(pmu, phys_id, die); + uncore_pci_pmu_unregister(pmu, die); } static int uncore_bus_notify(struct notifier_block *nb, @@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb, struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); struct intel_uncore_pmu *pmu; - int phys_id, die; + int die; /* Unregister the PMU when the device is going to be deleted. */ if (action != BUS_NOTIFY_DEL_DEVICE) @@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb, if (!pmu) return NOTIFY_DONE; - if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pdev, &die)) return NOTIFY_DONE; - uncore_pci_pmu_unregister(pmu, phys_id, die); + uncore_pci_pmu_unregister(pmu, die); return NOTIFY_OK; } @@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void) struct pci_dev *pci_sub_dev; bool notify = false; unsigned int devfn; - int phys_id, die; + int die; while (ids && ids->vendor) { pci_sub_dev = NULL; @@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void) if (!pmu) continue; - if (uncore_pci_get_dev_die_info(pci_sub_dev, - &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) continue; if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, - phys_id, die)) + die)) notify = true; } ids++; @@ -1636,6 +1620,11 @@ static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { .mmio_init = tgl_l_uncore_mmio_init, }; +static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { + .cpu_init = tgl_uncore_cpu_init, + .pci_init = skl_uncore_pci_init, +}; + static const struct intel_uncore_init_fun icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1683,6 +1672,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 9efea154349d..a3c6e1643ad2 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -124,7 +124,6 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int pci_phys_id; int dieid; /* Logical die ID */ int n_active; /* number of active events */ int n_events; @@ -173,11 +172,11 @@ struct freerunning_counters { struct pci2phy_map { struct list_head list; int segment; - int pbus_to_physid[256]; + int pbus_to_dieid[256]; }; struct pci2phy_map *__find_pci2phy_map(int segment); -int uncore_pcibus_to_physid(struct pci_bus *bus); +int uncore_pcibus_to_dieid(struct pci_bus *bus); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index bbd1120ae161..51271288499e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -60,7 +60,8 @@ #define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12 #define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36 - +#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43 +#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -405,6 +406,12 @@ static struct intel_uncore_type *tgl_msr_uncores[] = { NULL, }; +static void rkl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + void tgl_uncore_cpu_init(void) { uncore_msr_uncores = tgl_msr_uncores; @@ -412,6 +419,7 @@ void tgl_uncore_cpu_init(void) icl_uncore_cbox.ops = &skl_uncore_msr_ops; icl_uncore_clockbox.ops = &skl_uncore_msr_ops; snb_uncore_arb.ops = &skl_uncore_msr_ops; + skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box; } enum { @@ -649,7 +657,7 @@ int snb_pci2phy_map_init(int devid) pci_dev_put(dev); return -ENOMEM; } - map->pbus_to_physid[bus] = 0; + map->pbus_to_dieid[bus] = 0; raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); @@ -926,6 +934,14 @@ static const struct pci_device_id icl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -1019,6 +1035,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ + IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver), + IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver), { /* end marker */ } }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 7bdb1821215d..b79951d0707c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; + int i, bus, nodeid, segment, die_id; struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1370,36 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool if (!ubox_dev) break; bus = ubox_dev->bus->number; - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); - if (err) - break; - nodeid = config & NODE_ID_MASK; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, idmap_loc, &config); - if (err) - break; + /* + * The nodeid and idmap registers only contain enough + * information to handle 8 nodes. On systems with more + * than 8 nodes, we need to rely on NUMA information, + * filled in from BIOS supplied information, to determine + * the topology. + */ + if (nr_node_ids <= 8) { + /* get the Node ID of the local register */ + err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); + if (err) + break; + nodeid = config & NODE_ID_MASK; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, idmap_loc, &config); + if (err) + break; - segment = pci_domain_nr(ubox_dev->bus); - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + map->pbus_to_dieid[bus] = die_id; + break; + } + } raw_spin_unlock(&pci2phy_map_lock); - err = -ENOMEM; - break; - } + } else { + int node = pcibus_to_node(ubox_dev->bus); + int cpu; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } - /* - * every three bits in the Node ID mapping register maps - * to a particular node. - */ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; + die_id = -1; + for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) { + map->pbus_to_dieid[bus] = die_id = c->logical_die_id; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + if (WARN_ON_ONCE(die_id == -1)) { + err = -EINVAL; break; } } - raw_spin_unlock(&pci2phy_map_lock); } if (!err) { @@ -1412,17 +1453,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool i = -1; if (reverse) { for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] >= 0) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } else { for (bus = 0; bus <= 255; bus++) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] >= 0) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } } @@ -4646,19 +4687,14 @@ int snr_uncore_pci_init(void) static struct pci_dev *snr_uncore_get_mc_dev(int id) { struct pci_dev *mc_dev = NULL; - int phys_id, pkg; + int pkg; while (1) { mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); if (!mc_dev) break; - phys_id = uncore_pcibus_to_physid(mc_dev->bus); - if (phys_id < 0) - continue; - pkg = topology_phys_to_logical_pkg(phys_id); - if (pkg < 0) - continue; - else if (pkg == id) + pkg = uncore_pcibus_to_dieid(mc_dev->bus); + if (pkg == id) break; } return mc_dev; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 4be8f9cabd07..680404c58cb1 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -99,6 +99,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6a8edfe59b09..53b2b5fc23bc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -80,6 +80,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ #define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -132,7 +133,7 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ @@ -443,6 +444,10 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) +#define INTEL_PSD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT) + #define INTEL_PST_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) @@ -682,8 +687,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, - enabled_ack :1, - counter_freezing :1; + enabled_ack :1; /* * sysfs attrs */ @@ -724,7 +728,8 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_block :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; @@ -776,6 +781,7 @@ struct x86_pmu { /* * Intel perf metrics */ + int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); int (*set_topdown_event_period)(struct perf_event *event); @@ -871,6 +877,8 @@ do { \ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define PMU_FL_TFA 0x20 /* deal with TSX force abort */ #define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */ +#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */ +#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -1060,6 +1068,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +static inline bool fixed_counter_disabled(int i) +{ + return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); @@ -1157,6 +1170,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; extern struct event_constraint intel_icl_pebs_event_constraints[]; +extern struct event_constraint intel_spr_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index 136a1e847254..600bf8d15c0c 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -28,6 +28,7 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) for (bit = 0; bit < cnt; bit++) { if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + u64 mask; /* skip entry with no group */ if (!grp) @@ -44,8 +45,12 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ if (rdmsrl_safe(msr[bit].msr, &val)) continue; + + mask = msr[bit].mask; + if (!mask) + mask = ~0ULL; /* Disable zero counters if requested. */ - if (!zero && !val) + if (!zero && !(val & mask)) continue; grp->is_visible = NULL; diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h index 4c8e0afc5fb5..261b9bda24e3 100644 --- a/arch/x86/events/probe.h +++ b/arch/x86/events/probe.h @@ -4,10 +4,11 @@ #include <linux/sysfs.h> struct perf_msr { - u64 msr; - struct attribute_group *grp; + u64 msr; + struct attribute_group *grp; bool (*test)(int idx, void *data); - bool no_check; + bool no_check; + u64 mask; }; unsigned long diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 7dbbeaacd995..f42a70496a24 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -454,16 +454,9 @@ static struct attribute *rapl_events_cores[] = { NULL, }; -static umode_t -rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) -{ - return 0; -} - static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -476,7 +469,6 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -489,7 +481,6 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -502,7 +493,6 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -515,7 +505,6 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, - .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) @@ -523,12 +512,23 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } +/* Only lower 32bits of the MSR represents the energy counter */ +#define RAPL_MSR_MASK 0xFFFFFFFF + static struct perf_msr intel_rapl_msrs[] = { - [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, - [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, - [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, - [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr }, - [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK }, +}; + +static struct perf_msr intel_rapl_spr_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK }, }; /* @@ -761,7 +761,7 @@ static struct rapl_model model_spr = { BIT(PERF_RAPL_PSYS), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_msrs = intel_rapl_spr_msrs, }; static struct rapl_model model_amd_fam17h = { diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 89b1f74d3225..48e2c51464e8 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y := hv_init.o mmu.o nested.o -obj-$(CONFIG_X86_64) += hv_apic.o +obj-y := hv_init.o mmu.o nested.o irqdomain.o +obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e04d90af4c27..b81047dec1da 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -10,12 +10,14 @@ #include <linux/acpi.h> #include <linux/efi.h> #include <linux/types.h> +#include <linux/bitfield.h> #include <asm/apic.h> #include <asm/desc.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/idtentry.h> +#include <linux/kexec.h> #include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> @@ -25,6 +27,11 @@ #include <linux/cpuhotplug.h> #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> +#include <linux/highmem.h> + +int hyperv_init_cpuhp; +u64 hv_current_partition_id = ~0ull; +EXPORT_SYMBOL_GPL(hv_current_partition_id); void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); @@ -41,6 +48,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page); void __percpu **hyperv_pcpu_input_arg; EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); +void __percpu **hyperv_pcpu_output_arg; +EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); + u32 hv_max_vp_index; EXPORT_SYMBOL_GPL(hv_max_vp_index); @@ -73,12 +83,19 @@ static int hv_cpu_init(unsigned int cpu) void **input_arg; struct page *pg; - input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ - pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL); + pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0); if (unlikely(!pg)) return -ENOMEM; + + input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); *input_arg = page_address(pg); + if (hv_root_partition) { + void **output_arg; + + output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *output_arg = page_address(pg + 1); + } hv_get_vp_index(msr_vp_index); @@ -205,14 +222,23 @@ static int hv_cpu_die(unsigned int cpu) unsigned int new_cpu; unsigned long flags; void **input_arg; - void *input_pg = NULL; + void *pg; local_irq_save(flags); input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - input_pg = *input_arg; + pg = *input_arg; *input_arg = NULL; + + if (hv_root_partition) { + void **output_arg; + + output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *output_arg = NULL; + } + local_irq_restore(flags); - free_page((unsigned long)input_pg); + + free_pages((unsigned long)pg, hv_root_partition ? 1 : 0); if (hv_vp_assist_page && hv_vp_assist_page[cpu]) wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); @@ -261,6 +287,9 @@ static int hv_suspend(void) union hv_x64_msr_hypercall_contents hypercall_msr; int ret; + if (hv_root_partition) + return -EPERM; + /* * Reset the hypercall page as it is going to be invalidated * accross hibernation. Setting hv_hypercall_pg to NULL ensures @@ -312,6 +341,43 @@ static struct syscore_ops hv_syscore_ops = { .resume = hv_resume, }; +static void (* __initdata old_setup_percpu_clockev)(void); + +static void __init hv_stimer_setup_percpu_clockev(void) +{ + /* + * Ignore any errors in setting up stimer clockevents + * as we can run with the LAPIC timer as a fallback. + */ + (void)hv_stimer_alloc(); + + /* + * Still register the LAPIC timer, because the direct-mode STIMER is + * not supported by old versions of Hyper-V. This also allows users + * to switch to LAPIC timer via /sys, if they want to. + */ + if (old_setup_percpu_clockev) + old_setup_percpu_clockev(); +} + +static void __init hv_get_partition_id(void) +{ + struct hv_get_partition_id *output_page; + u64 status; + unsigned long flags; + + local_irq_save(flags); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + /* No point in proceeding if this failed */ + pr_err("Failed to get partition ID: %lld\n", status); + BUG(); + } + hv_current_partition_id = output_page->partition_id; + local_irq_restore(flags); +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -346,6 +412,12 @@ void __init hyperv_init(void) BUG_ON(hyperv_pcpu_input_arg == NULL); + /* Allocate the per-CPU state for output arg for root */ + if (hv_root_partition) { + hyperv_pcpu_output_arg = alloc_percpu(void *); + BUG_ON(hyperv_pcpu_output_arg == NULL); + } + /* Allocate percpu VP index */ hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), GFP_KERNEL); @@ -386,14 +458,45 @@ void __init hyperv_init(void) rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + if (hv_root_partition) { + struct page *pg; + void *src, *dst; + + /* + * For the root partition, the hypervisor will set up its + * hypercall page. The hypervisor guarantees it will not show + * up in the root's address space. The root can't change the + * location of the hypercall page. + * + * Order is important here. We must enable the hypercall page + * so it is populated with code, then copy the code to an + * executable page. + */ + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + pg = vmalloc_to_page(hv_hypercall_pg); + dst = kmap(pg); + src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, + MEMREMAP_WB); + BUG_ON(!(src && dst)); + memcpy(dst, src, HV_HYP_PAGE_SIZE); + memunmap(src); + kunmap(pg); + } else { + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + } /* - * Ignore any errors in setting up stimer clockevents - * as we can run with the LAPIC timer as a fallback. + * hyperv_init() is called before LAPIC is initialized: see + * apic_intr_mode_init() -> x86_platform.apic_post_init() and + * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER + * depends on LAPIC, so hv_stimer_alloc() should be called from + * x86_init.timers.setup_percpu_clockev. */ - (void)hv_stimer_alloc(); + old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; + x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; hv_apic_init(); @@ -401,6 +504,22 @@ void __init hyperv_init(void) register_syscore_ops(&hv_syscore_ops); + hyperv_init_cpuhp = cpuhp; + + if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) + hv_get_partition_id(); + + BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); + +#ifdef CONFIG_PCI_MSI + /* + * If we're running as root, we want to create our own PCI MSI domain. + * We can't set this in hv_pci_init because that would be too late. + */ + if (hv_root_partition) + x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; +#endif + return; remove_cpuhp_state: @@ -525,6 +644,20 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); bool hv_is_hibernation_supported(void) { - return acpi_sleep_state_supported(ACPI_STATE_S4); + return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); } EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); + +enum hv_isolation_type hv_get_isolation_type(void) +{ + if (!(ms_hyperv.features_b & HV_ISOLATION)) + return HV_ISOLATION_TYPE_NONE; + return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); +} +EXPORT_SYMBOL_GPL(hv_get_isolation_type); + +bool hv_is_isolation_supported(void) +{ + return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; +} +EXPORT_SYMBOL_GPL(hv_is_isolation_supported); diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c new file mode 100644 index 000000000000..60461e598239 --- /dev/null +++ b/arch/x86/hyperv/hv_proc.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <linux/version.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> +#include <linux/acpi.h> +#include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> +#include <linux/minmax.h> +#include <asm/hypervisor.h> +#include <asm/mshyperv.h> +#include <asm/apic.h> + +#include <asm/trace/hyperv.h> + +/* + * See struct hv_deposit_memory. The first u64 is partition ID, the rest + * are GPAs. + */ +#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1) + +/* Deposits exact number of pages. Must be called with interrupts enabled. */ +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + struct page **pages, *page; + int *counts; + int num_allocations; + int i, j, page_count; + int order; + u64 status; + int ret; + u64 base_pfn; + struct hv_deposit_memory *input_page; + unsigned long flags; + + if (num_pages > HV_DEPOSIT_MAX) + return -E2BIG; + if (!num_pages) + return 0; + + /* One buffer for page pointers and counts */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + pages = page_address(page); + + counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL); + if (!counts) { + free_page((unsigned long)pages); + return -ENOMEM; + } + + /* Allocate all the pages before disabling interrupts */ + i = 0; + + while (num_pages) { + /* Find highest order we can actually allocate */ + order = 31 - __builtin_clz(num_pages); + + while (1) { + pages[i] = alloc_pages_node(node, GFP_KERNEL, order); + if (pages[i]) + break; + if (!order) { + ret = -ENOMEM; + num_allocations = i; + goto err_free_allocations; + } + --order; + } + + split_page(pages[i], order); + counts[i] = 1 << order; + num_pages -= counts[i]; + i++; + } + num_allocations = i; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + + /* Populate gpa_page_list - these will fit on the input page */ + for (i = 0, page_count = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j, ++page_count) + input_page->gpa_page_list[page_count] = base_pfn + j; + } + status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY, + page_count, 0, input_page, NULL); + local_irq_restore(flags); + + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + pr_err("Failed to deposit pages: %lld\n", status); + ret = status; + goto err_free_allocations; + } + + ret = 0; + goto free_buf; + +err_free_allocations: + for (i = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j) + __free_page(pfn_to_page(base_pfn + j)); + } + +free_buf: + free_page((unsigned long)pages); + kfree(counts); + return ret; +} + +int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) +{ + struct hv_add_logical_processor_in *input; + struct hv_add_logical_processor_out *output; + u64 status; + unsigned long flags; + int ret = 0; + int pxm = node_to_pxm(node); + + /* + * When adding a logical processor, the hypervisor may return + * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more + * pages and retry. + */ + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + /* We don't do anything with the output right now */ + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input->lp_index = lp_index; + input->apic_id = apic_id; + input->flags = 0; + input->proximity_domain_info.domain_id = pxm; + input->proximity_domain_info.flags.reserved = 0; + input->proximity_domain_info.flags.proximity_info_valid = 1; + input->proximity_domain_info.flags.proximity_preferred = 1; + status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, + input, output); + local_irq_restore(flags); + + status &= HV_HYPERCALL_RESULT_MASK; + + if (status != HV_STATUS_INSUFFICIENT_MEMORY) { + if (status != HV_STATUS_SUCCESS) { + pr_err("%s: cpu %u apic ID %u, %lld\n", __func__, + lp_index, apic_id, status); + ret = status; + } + break; + } + ret = hv_call_deposit_pages(node, hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + struct hv_create_vp *input; + u64 status; + unsigned long irq_flags; + int ret = 0; + int pxm = node_to_pxm(node); + + /* Root VPs don't seem to need pages deposited */ + if (partition_id != hv_current_partition_id) { + /* The value 90 is empirically determined. It may change. */ + ret = hv_call_deposit_pages(node, partition_id, 90); + if (ret) + return ret; + } + + do { + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->flags = flags; + input->subnode_type = HvSubnodeAny; + if (node != NUMA_NO_NODE) { + input->proximity_domain_info.domain_id = pxm; + input->proximity_domain_info.flags.reserved = 0; + input->proximity_domain_info.flags.proximity_info_valid = 1; + input->proximity_domain_info.flags.proximity_preferred = 1; + } else { + input->proximity_domain_info.as_uint64 = 0; + } + status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); + local_irq_restore(irq_flags); + + status &= HV_HYPERCALL_RESULT_MASK; + + if (status != HV_STATUS_INSUFFICIENT_MEMORY) { + if (status != HV_STATUS_SUCCESS) { + pr_err("%s: vcpu %u, lp %u, %lld\n", __func__, + vp_index, flags, status); + ret = status; + } + break; + } + ret = hv_call_deposit_pages(node, partition_id, 1); + + } while (!ret); + + return ret; +} + diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c new file mode 100644 index 000000000000..4421a8d92e23 --- /dev/null +++ b/arch/x86/hyperv/irqdomain.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor. + * + * Authors: + * Sunil Muthuswamy <sunilmut@microsoft.com> + * Wei Liu <wei.liu@kernel.org> + */ + +#include <linux/pci.h> +#include <linux/irq.h> +#include <asm/mshyperv.h> + +static int hv_map_interrupt(union hv_device_id device_id, bool level, + int cpu, int vector, struct hv_interrupt_entry *entry) +{ + struct hv_input_map_device_interrupt *input; + struct hv_output_map_device_interrupt *output; + struct hv_device_interrupt_descriptor *intr_desc; + unsigned long flags; + u64 status; + int nr_bank, var_size; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + intr_desc = &input->interrupt_descriptor; + memset(input, 0, sizeof(*input)); + input->partition_id = hv_current_partition_id; + input->device_id = device_id.as_uint64; + intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED; + intr_desc->vector_count = 1; + intr_desc->target.vector = vector; + + if (level) + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL; + else + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE; + + intr_desc->target.vp_set.valid_bank_mask = 0; + intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu)); + if (nr_bank < 0) { + local_irq_restore(flags); + pr_err("%s: unable to generate VP set\n", __func__); + return EINVAL; + } + intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; + + /* + * var-sized hypercall, var-size starts after vp_mask (thus + * vp_set.format does not count, but vp_set.valid_bank_mask + * does). + */ + var_size = nr_bank + 1; + + status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size, + input, output); + *entry = output->interrupt_entry; + + local_irq_restore(flags); + + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + pr_err("%s: hypercall failed, status %lld\n", __func__, status); + + return status & HV_HYPERCALL_RESULT_MASK; +} + +static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) +{ + unsigned long flags; + struct hv_input_unmap_device_interrupt *input; + struct hv_interrupt_entry *intr_entry; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + intr_entry = &input->interrupt_entry; + input->partition_id = hv_current_partition_id; + input->device_id = id; + *intr_entry = *old_entry; + + status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); + local_irq_restore(flags); + + return status & HV_HYPERCALL_RESULT_MASK; +} + +#ifdef CONFIG_PCI_MSI +struct rid_data { + struct pci_dev *bridge; + u32 rid; +}; + +static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct rid_data *rd = data; + u8 bus = PCI_BUS_NUM(rd->rid); + + if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) { + rd->bridge = pdev; + rd->rid = alias; + } + + return 0; +} + +static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev) +{ + union hv_device_id dev_id; + struct rid_data data = { + .bridge = NULL, + .rid = PCI_DEVID(dev->bus->number, dev->devfn) + }; + + pci_for_each_dma_alias(dev, get_rid_cb, &data); + + dev_id.as_uint64 = 0; + dev_id.device_type = HV_DEVICE_TYPE_PCI; + dev_id.pci.segment = pci_domain_nr(dev->bus); + + dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid); + dev_id.pci.bdf.device = PCI_SLOT(data.rid); + dev_id.pci.bdf.function = PCI_FUNC(data.rid); + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE; + + if (data.bridge) { + int pos; + + /* + * Microsoft Hypervisor requires a bus range when the bridge is + * running in PCI-X mode. + * + * To distinguish conventional vs PCI-X bridge, we can check + * the bridge's PCI-X Secondary Status Register, Secondary Bus + * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge + * Specification Revision 1.0 5.2.2.1.3. + * + * Value zero means it is in conventional mode, otherwise it is + * in PCI-X mode. + */ + + pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX); + if (pos) { + u16 status; + + pci_read_config_word(data.bridge, pos + + PCI_X_BRIDGE_SSTATUS, &status); + + if (status & PCI_X_SSTATUS_FREQ) { + /* Non-zero, PCI-X mode */ + u8 sec_bus, sub_bus; + + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE; + + pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus); + dev_id.pci.shadow_bus_range.secondary_bus = sec_bus; + pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus); + dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus; + } + } + } + + return dev_id; +} + +static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector, + struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id = hv_build_pci_dev_id(dev); + + return hv_map_interrupt(device_id, false, cpu, vector, entry); +} + +static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg) +{ + /* High address is always 0 */ + msg->address_hi = 0; + msg->address_lo = entry->msi_entry.address.as_uint32; + msg->data = entry->msi_entry.data.as_uint32; +} + +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry); +static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *msidesc; + struct pci_dev *dev; + struct hv_interrupt_entry out_entry, *stored_entry; + struct irq_cfg *cfg = irqd_cfg(data); + cpumask_t *affinity; + int cpu; + u64 status; + + msidesc = irq_data_get_msi_desc(data); + dev = msi_desc_to_pci_dev(msidesc); + + if (!cfg) { + pr_debug("%s: cfg is NULL", __func__); + return; + } + + affinity = irq_data_get_effective_affinity_mask(data); + cpu = cpumask_first_and(affinity, cpu_online_mask); + + if (data->chip_data) { + /* + * This interrupt is already mapped. Let's unmap first. + * + * We don't use retarget interrupt hypercalls here because + * Microsoft Hypervisor doens't allow root to change the vector + * or specify VPs outside of the set that is initially used + * during mapping. + */ + stored_entry = data->chip_data; + data->chip_data = NULL; + + status = hv_unmap_msi_interrupt(dev, stored_entry); + + kfree(stored_entry); + + if (status != HV_STATUS_SUCCESS) { + pr_debug("%s: failed to unmap, status %lld", __func__, status); + return; + } + } + + stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC); + if (!stored_entry) { + pr_debug("%s: failed to allocate chip data\n", __func__); + return; + } + + status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry); + if (status != HV_STATUS_SUCCESS) { + kfree(stored_entry); + return; + } + + *stored_entry = out_entry; + data->chip_data = stored_entry; + entry_to_msi_msg(&out_entry, msg); + + return; +} + +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry) +{ + return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry); +} + +static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq) +{ + u64 status; + struct hv_interrupt_entry old_entry; + struct irq_desc *desc; + struct irq_data *data; + struct msi_msg msg; + + desc = irq_to_desc(irq); + if (!desc) { + pr_debug("%s: no irq desc\n", __func__); + return; + } + + data = &desc->irq_data; + if (!data) { + pr_debug("%s: no irq data\n", __func__); + return; + } + + if (!data->chip_data) { + pr_debug("%s: no chip data\n!", __func__); + return; + } + + old_entry = *(struct hv_interrupt_entry *)data->chip_data; + entry_to_msi_msg(&old_entry, &msg); + + kfree(data->chip_data); + data->chip_data = NULL; + + status = hv_unmap_msi_interrupt(dev, &old_entry); + + if (status != HV_STATUS_SUCCESS) { + pr_err("%s: hypercall failed, status %lld\n", __func__, status); + return; + } +} + +static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + int i; + struct msi_desc *entry; + struct pci_dev *pdev; + + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + pdev = to_pci_dev(dev); + + for_each_pci_msi_entry(entry, pdev) { + if (entry->irq) { + for (i = 0; i < entry->nvec_used; i++) { + hv_teardown_msi_irq_common(pdev, entry, entry->irq + i); + irq_domain_free_irqs(entry->irq + i, 1); + } + } + } +} + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip hv_pci_msi_controller = { + .name = "HV-PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = hv_irq_compose_msi_msg, + .irq_set_affinity = msi_domain_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct msi_domain_ops pci_msi_domain_ops = { + .domain_free_irqs = hv_msi_domain_free_irqs, + .msi_prepare = pci_msi_prepare, +}; + +static struct msi_domain_info hv_pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &hv_pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + +struct irq_domain * __init hv_create_pci_msi_domain(void) +{ + struct irq_domain *d = NULL; + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI"); + if (fn) + d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain); + + /* No point in going further if we can't get an irq domain */ + BUG_ON(!d); + + return d; +} + +#endif /* CONFIG_PCI_MSI */ + +int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id; + + device_id.as_uint64 = 0; + device_id.device_type = HV_DEVICE_TYPE_IOAPIC; + device_id.ioapic.ioapic_id = (u8)ioapic_id; + + return hv_unmap_interrupt(device_id.as_uint64, entry); +} +EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt); + +int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector, + struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id; + + device_id.as_uint64 = 0; + device_id.device_type = HV_DEVICE_TYPE_IOAPIC; + device_id.ioapic.ioapic_id = (u8)ioapic_id; + + return hv_map_interrupt(device_id, level, cpu, vector, entry); +} +EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt); diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 5208ba49c89a..2c87350c1fb0 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -66,11 +66,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, if (!hv_hypercall_pg) goto do_native; - if (cpumask_empty(cpus)) - return; - local_irq_save(flags); + /* + * Only check the mask _after_ interrupt has been disabled to avoid the + * mask changing under our feet. + */ + if (cpumask_empty(cpus)) { + local_irq_restore(flags); + return; + } + flush_pcpu = (struct hv_tlb_flush **) this_cpu_ptr(hyperv_pcpu_input_arg); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 81cf22398cd1..5e3d9b7fd5fb 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -347,7 +347,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, */ unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault); user_access_end(); if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 6d2df1ee427b..65064d9f7fa6 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -159,6 +159,8 @@ static inline u64 x86_default_get_root_pointer(void) extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ +struct cper_ia_proc_ctx; + #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { @@ -177,6 +179,15 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) */ return PAGE_KERNEL_NOENC; } + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id); +#else +static inline int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id) +{ + return -EINVAL; +} #endif #define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT) diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h new file mode 100644 index 000000000000..e003a01b7c67 --- /dev/null +++ b/arch/x86/include/asm/acrn.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_ACRN_H +#define _ASM_X86_ACRN_H + +/* + * This CPUID returns feature bitmaps in EAX. + * Guest VM uses this to detect the appropriate feature bit. + */ +#define ACRN_CPUID_FEATURES 0x40000001 +/* Bit 0 indicates whether guest VM is privileged */ +#define ACRN_FEATURE_PRIVILEGED_VM BIT(0) + +void acrn_setup_intr_handler(void (*handler)(void)); +void acrn_remove_intr_handler(void); + +static inline u32 acrn_cpuid_base(void) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return hypervisor_cpuid_base("ACRNACRNACRN", 0); + + return 0; +} + +/* + * Hypercalls for ACRN + * + * - VMCALL instruction is used to implement ACRN hypercalls. + * - ACRN hypercall ABI: + * - Hypercall number is passed in R8 register. + * - Up to 2 arguments are passed in RDI, RSI. + * - Return value will be placed in RAX. + * + * Because GCC doesn't support R8 register as direct register constraints, use + * supported constraint as input with a explicit MOV to R8 in beginning of asm. + */ +static inline long acrn_hypercall0(unsigned long hcall_id) +{ + long result; + + asm volatile("movl %1, %%r8d\n\t" + "vmcall\n\t" + : "=a" (result) + : "g" (hcall_id) + : "r8", "memory"); + + return result; +} + +static inline long acrn_hypercall1(unsigned long hcall_id, + unsigned long param1) +{ + long result; + + asm volatile("movl %1, %%r8d\n\t" + "vmcall\n\t" + : "=a" (result) + : "g" (hcall_id), "D" (param1) + : "r8", "memory"); + + return result; +} + +static inline long acrn_hypercall2(unsigned long hcall_id, + unsigned long param1, + unsigned long param2) +{ + long result; + + asm volatile("movl %1, %%r8d\n\t" + "vmcall\n\t" + : "=a" (result) + : "g" (hcall_id), "D" (param1), "S" (param2) + : "r8", "memory"); + + return result; +} + +#endif /* _ASM_X86_ACRN_H */ diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h deleted file mode 100644 index 87ce8e963215..000000000000 --- a/arch/x86/include/asm/apb_timer.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare - * - * (C) Copyright 2009 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * Note: - */ - -#ifndef ASM_X86_APBT_H -#define ASM_X86_APBT_H -#include <linux/sfi.h> - -#ifdef CONFIG_APB_TIMER - -/* default memory mapped register base */ -#define LNW_SCU_ADDR 0xFF100000 -#define LNW_EXT_TIMER_OFFSET 0x1B800 -#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET) -#define LNW_EXT_TIMER_PGOFFSET 0x800 - -/* APBT clock speed range from PCLK to fabric base, 25-100MHz */ -#define APBT_MAX_FREQ 50000000 -#define APBT_MIN_FREQ 1000000 -#define APBT_MMAP_SIZE 1024 - -extern void apbt_time_init(void); -extern void apbt_setup_secondary_clock(void); - -extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); -extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); -extern int sfi_mtimer_num; - -#else /* CONFIG_APB_TIMER */ - -static inline void apbt_time_init(void) { } - -#endif -#endif /* ASM_X86_APBT_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4e3099d9ae62..412b51e059c8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -197,16 +197,6 @@ static inline bool apic_needs_pit(void) { return true; } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC -/* - * Make previous memory operations globally visible before - * sending the IPI through x2apic wrmsr. We need a serializing instruction or - * mfence for this. - */ -static inline void x2apic_wrmsr_fence(void) -{ - asm volatile("mfence" : : : "memory"); -} - static inline void native_apic_msr_write(u32 reg, u32 v) { if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || @@ -259,6 +249,7 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; +extern void __init x2apic_set_max_apicid(u32 apicid); extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) @@ -305,11 +296,10 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - /* dest_logical is used by the IPI functions */ - u32 dest_logical; u32 disable_esr; - u32 irq_delivery_mode; - u32 irq_dest_mode; + + enum apic_delivery_modes delivery_mode; + bool dest_mode_logical; u32 (*calc_dest_apicid)(unsigned int cpu); @@ -520,12 +510,10 @@ static inline void apic_smt_update(void) { } #endif struct msi_msg; +struct irq_cfg; -#ifdef CONFIG_PCI_MSI -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg); -#else -# define x86_vector_msi_compose_msg NULL -#endif +extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar); extern void ioapic_zap_locks(void); diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 05e694ed8386..5716f22f81ac 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -432,15 +432,13 @@ struct local_apic { #define BAD_APICID 0xFFFFu #endif -enum ioapic_irq_destination_types { - dest_Fixed = 0, - dest_LowestPrio = 1, - dest_SMI = 2, - dest__reserved_1 = 3, - dest_NMI = 4, - dest_INIT = 5, - dest__reserved_2 = 6, - dest_ExtINT = 7 +enum apic_delivery_modes { + APIC_DELIVERY_MODE_FIXED = 0, + APIC_DELIVERY_MODE_LOWESTPRIO = 1, + APIC_DELIVERY_MODE_SMI = 2, + APIC_DELIVERY_MODE_NMI = 4, + APIC_DELIVERY_MODE_INIT = 5, + APIC_DELIVERY_MODE_EXTINT = 7, }; #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b6cac6e9bb70..f732741ad7c7 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -199,7 +199,7 @@ static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { - return try_cmpxchg(&v->counter, old, new); + return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 809bd010a751..7886d0578fc9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -187,7 +187,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { - return try_cmpxchg(&v->counter, old, new); + return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 7f828fe49797..4819d5e5a335 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -84,4 +84,22 @@ do { \ #include <asm-generic/barrier.h> +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. + */ +static inline void weak_wrmsr_fence(void) +{ + asm volatile("mfence; lfence" : : : "memory"); +} + #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index 86b63c7feab7..86b2e0dcc4bf 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_CACHEINFO_H #define _ASM_X86_CACHEINFO_H -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index a8bfac131256..4d4ec5cbdc51 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,7 +221,7 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) -#define try_cmpxchg(ptr, pold, new) \ +#define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) /* diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 0e327a01f50f..be09c7eac89f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -159,17 +159,6 @@ struct compat_shmid64_ds { compat_ulong_t __unused5; }; -/* - * The type of struct elf_prstatus.pr_reg in compatible core dumps. - */ -typedef struct user_regs_struct compat_elf_gregset_t; - -/* Full regset -- prstatus on x32, otherwise on ia32 */ -#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296) -#define SET_PR_FPVALID(S, V, R) \ - do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ - while (0) - #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) @@ -177,14 +166,13 @@ typedef struct user_regs_struct compat_elf_gregset_t; static inline void __user *arch_compat_alloc_user_space(long len) { - compat_uptr_t sp; - - if (test_thread_flag(TIF_IA32)) { - sp = task_pt_regs(current)->sp; - } else { - /* -128 for the x32 ABI redzone */ - sp = task_pt_regs(current)->sp - 128; - } + compat_uptr_t sp = task_pt_regs(current)->sp; + + /* + * -128 for the x32 ABI redzone. For IA32, it is not strictly + * necessary, but not harmful. + */ + sp -= 128; return (void __user *)round_down(sp - len, 16); } diff --git a/arch/x86/include/asm/copy_mc_test.h b/arch/x86/include/asm/copy_mc_test.h deleted file mode 100644 index e4991ba96726..000000000000 --- a/arch/x86/include/asm/copy_mc_test.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _COPY_MC_TEST_H_ -#define _COPY_MC_TEST_H_ - -#ifndef __ASSEMBLY__ -#ifdef CONFIG_COPY_MC_TEST -extern unsigned long copy_mc_test_src; -extern unsigned long copy_mc_test_dst; - -static inline void copy_mc_inject_src(void *addr) -{ - if (addr) - copy_mc_test_src = (unsigned long) addr; - else - copy_mc_test_src = ~0UL; -} - -static inline void copy_mc_inject_dst(void *addr) -{ - if (addr) - copy_mc_test_dst = (unsigned long) addr; - else - copy_mc_test_dst = ~0UL; -} -#else /* CONFIG_COPY_MC_TEST */ -static inline void copy_mc_inject_src(void *addr) -{ -} - -static inline void copy_mc_inject_dst(void *addr) -{ -} -#endif /* CONFIG_COPY_MC_TEST */ - -#else /* __ASSEMBLY__ */ -#include <asm/export.h> - -#ifdef CONFIG_COPY_MC_TEST -.macro COPY_MC_TEST_CTL - .pushsection .data - .align 8 - .globl copy_mc_test_src - copy_mc_test_src: - .quad 0 - EXPORT_SYMBOL_GPL(copy_mc_test_src) - .globl copy_mc_test_dst - copy_mc_test_dst: - .quad 0 - EXPORT_SYMBOL_GPL(copy_mc_test_dst) - .popsection -.endm - -.macro COPY_MC_TEST_SRC reg count target - leaq \count(\reg), %r9 - cmp copy_mc_test_src, %r9 - ja \target -.endm - -.macro COPY_MC_TEST_DST reg count target - leaq \count(\reg), %r9 - cmp copy_mc_test_dst, %r9 - ja \target -.endm -#else -.macro COPY_MC_TEST_CTL -.endm - -.macro COPY_MC_TEST_SRC reg count target -.endm - -.macro COPY_MC_TEST_DST reg count target -.endm -#endif /* CONFIG_COPY_MC_TEST */ -#endif /* __ASSEMBLY__ */ -#endif /* _COPY_MC_TEST_H_ */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 59bf91c57aa8..1728d4ce5730 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,6 +30,7 @@ enum cpuid_leafs CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, + CPUID_8000_001F_EAX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dad350d42ecf..cc96e26d69f7 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ +/* FREE! ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +/* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ +/* FREE! ( 7*32+20) */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -236,11 +236,11 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ #define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ @@ -292,6 +292,7 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ @@ -335,6 +336,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ @@ -356,6 +358,7 @@ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ #define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ @@ -374,6 +377,7 @@ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ @@ -381,6 +385,13 @@ #define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ +#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h deleted file mode 100644 index 777c0f63418c..000000000000 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ /dev/null @@ -1,118 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared glue code for 128bit block ciphers - */ - -#ifndef _CRYPTO_GLUE_HELPER_H -#define _CRYPTO_GLUE_HELPER_H - -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <asm/fpu/api.h> -#include <crypto/b128ops.h> - -typedef void (*common_glue_func_t)(const void *ctx, u8 *dst, const u8 *src); -typedef void (*common_glue_cbc_func_t)(const void *ctx, u8 *dst, const u8 *src); -typedef void (*common_glue_ctr_func_t)(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -typedef void (*common_glue_xts_func_t)(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -struct common_glue_func_entry { - unsigned int num_blocks; /* number of blocks that @fn will process */ - union { - common_glue_func_t ecb; - common_glue_cbc_func_t cbc; - common_glue_ctr_func_t ctr; - common_glue_xts_func_t xts; - } fn_u; -}; - -struct common_glue_ctx { - unsigned int num_funcs; - int fpu_blocks_limit; /* -1 means fpu not needed at all */ - - /* - * First funcs entry must have largest num_blocks and last funcs entry - * must have num_blocks == 1! - */ - struct common_glue_func_entry funcs[]; -}; - -static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, - struct skcipher_walk *walk, - bool fpu_enabled, unsigned int nbytes) -{ - if (likely(fpu_blocks_limit < 0)) - return false; - - if (fpu_enabled) - return true; - - /* - * Vector-registers are only used when chunk to be processed is large - * enough, so do not enable FPU until it is necessary. - */ - if (nbytes < bsize * (unsigned int)fpu_blocks_limit) - return false; - - /* prevent sleeping if FPU is in use */ - skcipher_walk_atomise(walk); - - kernel_fpu_begin(); - return true; -} - -static inline void glue_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); -} - -static inline void le128_to_be128(be128 *dst, const le128 *src) -{ - dst->a = cpu_to_be64(le64_to_cpu(src->a)); - dst->b = cpu_to_be64(le64_to_cpu(src->b)); -} - -static inline void be128_to_le128(le128 *dst, const be128 *src) -{ - dst->a = cpu_to_le64(be64_to_cpu(src->a)); - dst->b = cpu_to_le64(be64_to_cpu(src->b)); -} - -static inline void le128_inc(le128 *i) -{ - u64 a = le64_to_cpu(i->a); - u64 b = le64_to_cpu(i->b); - - b++; - if (!b) - a++; - - i->a = cpu_to_le64(a); - i->b = cpu_to_le64(b); -} - -extern int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req); - -extern int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, - struct skcipher_request *req); - -extern int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req); - -extern int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req); - -extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req, - common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx, bool decrypt); - -extern void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, - const u8 *src, le128 *iv, - common_glue_func_t fn); - -#endif /* _CRYPTO_GLUE_HELPER_H */ diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h deleted file mode 100644 index 251c2c89d7cf..000000000000 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_SERPENT_AVX_H -#define ASM_X86_SERPENT_AVX_H - -#include <crypto/b128ops.h> -#include <crypto/serpent.h> -#include <linux/types.h> - -struct crypto_skcipher; - -#define SERPENT_PARALLEL_BLOCKS 8 - -struct serpent_xts_ctx { - struct serpent_ctx tweak_ctx; - struct serpent_ctx crypt_ctx; -}; - -asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, - const u8 *src); -asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src); - -asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src); -asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); - -extern void __serpent_crypt_ctr(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -extern void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv); - -extern int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen); - -#endif diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5861d34f9771..b7dd944dc867 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -62,6 +62,12 @@ # define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) #endif +#ifdef CONFIG_X86_SGX +# define DISABLE_SGX 0 +#else +# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -74,7 +80,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_SMAP) +#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 @@ -85,6 +91,7 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index bc9758ef292e..4d0b126835b8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -12,6 +12,7 @@ #include <linux/pgtable.h> extern unsigned long efi_fw_vendor, efi_config_table; +extern unsigned long efi_mixed_mode_stack_pa; /* * We map the EFI regions needed for runtime services non-contiguously, @@ -68,17 +69,33 @@ extern unsigned long efi_fw_vendor, efi_config_table; #f " called with too many arguments (" #p ">" #n ")"); \ }) +static inline void efi_fpu_begin(void) +{ + /* + * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires + * that FCW and MXCSR (64-bit) must be initialized prior to calling + * UEFI code. (Oddly the spec does not require that the FPU stack + * be empty.) + */ + kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); +} + +static inline void efi_fpu_end(void) +{ + kernel_fpu_end(); +} + #ifdef CONFIG_X86_32 #define arch_efi_call_virt_setup() \ ({ \ - kernel_fpu_begin(); \ + efi_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ }) #define arch_efi_call_virt_teardown() \ ({ \ firmware_restrict_branch_speculation_end(); \ - kernel_fpu_end(); \ + efi_fpu_end(); \ }) #define arch_efi_call_virt(p, f, args...) p->f(args) @@ -94,22 +111,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...); __efi_call(__VA_ARGS__); \ }) -/* - * struct efi_scratch - Scratch space used while switching to/from efi_mm - * @phys_stack: stack used during EFI Mixed Mode - * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm - */ -struct efi_scratch { - u64 phys_stack; - struct mm_struct *prev_mm; -} __packed; - #define arch_efi_call_virt_setup() \ ({ \ efi_sync_low_kernel_mappings(); \ - kernel_fpu_begin(); \ + efi_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ - efi_switch_mm(&efi_mm); \ + efi_enter_mm(); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -117,9 +124,9 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - efi_switch_mm(efi_scratch.prev_mm); \ + efi_leave_mm(); \ firmware_restrict_branch_speculation_end(); \ - kernel_fpu_end(); \ + efi_fpu_end(); \ }) #ifdef CONFIG_KASAN @@ -136,7 +143,6 @@ struct efi_scratch { #endif /* CONFIG_X86_32 */ -extern struct efi_scratch efi_scratch; extern int __init efi_memblock_x86_reserve_range(void); extern void __init efi_print_memmap(void); extern void __init efi_map_region(efi_memory_desc_t *md); @@ -149,10 +155,12 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); -extern void efi_switch_mm(struct mm_struct *mm); -extern void efi_recover_from_page_fault(unsigned long phys_addr); +extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); +void efi_enter_mm(void); +void efi_leave_mm(void); + /* kexec external ABI */ struct efi_setup_data { u64 fw_vendor; @@ -213,8 +221,6 @@ static inline bool efi_is_64bit(void) static inline bool efi_is_native(void) { - if (!IS_ENABLED(CONFIG_X86_64)) - return true; return efi_is_64bit(); } @@ -382,4 +388,7 @@ static inline void efi_fake_memmap_early(void) } #endif +#define arch_ima_efi_boot_mode \ + ({ extern struct boot_params boot_params; boot_params.secure_boot; }) + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index b9a5d488f1a5..9224d40cdefe 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -186,8 +186,9 @@ static inline void elf_common_init(struct thread_struct *t, #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); -#define compat_start_thread compat_start_thread +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32); +#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \ + compat_start_thread(regs, new_ip, new_sp, ex->e_machine == EM_X86_64) void set_personality_ia32(bool); #define COMPAT_SET_PERSONALITY(ex) \ @@ -361,9 +362,9 @@ do { \ #define AT_SYSINFO 32 #define COMPAT_ARCH_DLINFO \ -if (test_thread_flag(TIF_X32)) \ +if (exec->e_machine == EM_X86_64) \ ARCH_DLINFO_X32; \ -else \ +else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \ ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) @@ -382,8 +383,12 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages + int uses_interp, bool x32); +#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ + compat_arch_setup_additional_pages(bprm, interpreter, \ + (ex->e_machine == EM_X86_64)) + +extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs); /* Do not change the values. See get_align_mask() */ enum align_flags { diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h new file mode 100644 index 000000000000..f1b6c7a8d8fc --- /dev/null +++ b/arch/x86/include/asm/elfcore-compat.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_ELFCORE_COMPAT_H +#define _ASM_X86_ELFCORE_COMPAT_H + +#include <asm/user32.h> + +/* + * On amd64 we have two 32bit ABIs - i386 and x32. The latter + * has bigger registers, so we use it for compat_elf_regset_t. + * The former uses i386_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID + * are used to choose the size and location of ->pr_fpvalid of + * the layout actually used. + */ +typedef struct user_regs_struct compat_elf_gregset_t; + +struct i386_elf_prstatus +{ + struct compat_elf_prstatus_common common; + struct user_regs_struct32 pr_reg; + compat_int_t pr_fpvalid; +}; + +#define PRSTATUS_SIZE \ + (user_64bit_mode(task_pt_regs(current)) \ + ? sizeof(struct compat_elf_prstatus) \ + : sizeof(struct i386_elf_prstatus)) +#define SET_PR_FPVALID(S) \ + (*(user_64bit_mode(task_pt_regs(current)) \ + ? &(S)->pr_fpvalid \ + : &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1) + +#endif diff --git a/arch/x86/include/asm/enclu.h b/arch/x86/include/asm/enclu.h new file mode 100644 index 000000000000..b1314e41a744 --- /dev/null +++ b/arch/x86/include/asm/enclu.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_ENCLU_H +#define _ASM_X86_ENCLU_H + +#define EENTER 0x02 +#define ERESUME 0x03 +#define EEXIT 0x04 + +#endif /* _ASM_X86_ENCLU_H */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 6fe54b2813c1..2b87b191b3b8 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -43,8 +43,6 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) } #define arch_check_user_regs arch_check_user_regs -#define ARCH_SYSCALL_EXIT_WORK (_TIF_SINGLESTEP) - static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) { diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 77217bd292bd..d0dcefb5cc59 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,13 +14,20 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +#include <asm/kmap_size.h> + /* * Exposed to assembly code for setting up initial page tables. Cannot be * calculated in assembly code (fixmap entries are an enum), but is sanity * checked in the actual fixmap C code to make sure that the fixmap is * covered fully. */ -#define FIXMAP_PMD_NUM 2 +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP +# define FIXMAP_PMD_NUM 2 +#else +# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512)) +# define FIXMAP_PMD_NUM (KM_PMDS + 2) +#endif /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ #define FIXMAP_PMD_TOP 507 @@ -31,7 +38,6 @@ #include <asm/pgtable_types.h> #ifdef CONFIG_X86_32 #include <linux/threads.h> -#include <asm/kmap_types.h> #else #include <uapi/asm/vsyscall.h> #endif @@ -92,9 +98,9 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif -#ifdef CONFIG_X86_32 +#ifdef CONFIG_KMAP_LOCAL FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif @@ -102,9 +108,6 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT_XXL FIX_PARAVIRT_BOOTMAP, #endif -#ifdef CONFIG_X86_INTEL_MID - FIX_LNW_VRTC, -#endif #ifdef CONFIG_ACPI_APEI_GHES /* Used for GHES mapping from assorted contexts */ @@ -151,7 +154,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index dcd9503b1098..ed33a14188f6 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -16,30 +16,68 @@ * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods * of time. - * If you intend to use the FPU in softirq you need to check first with + * If you intend to use the FPU in irq/softirq you need to check first with * irq_fpu_usable() if it is possible. */ -extern void kernel_fpu_begin(void); + +/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ +#define KFPU_387 _BITUL(0) /* 387 state will be initialized */ +#define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */ + +extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); +/* Code that is unaware of kernel_fpu_begin_mask() can use this */ +static inline void kernel_fpu_begin(void) +{ +#ifdef CONFIG_X86_64 + /* + * Any 64-bit code that uses 387 instructions must explicitly request + * KFPU_387. + */ + kernel_fpu_begin_mask(KFPU_MXCSR); +#else + /* + * 32-bit kernel code may use 387 operations as well as SSE2, etc, + * as long as it checks that the CPU has the required capability. + */ + kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); +#endif +} + /* * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. * A context switch will (and softirq might) save CPU's FPU registers to * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. + * + * local_bh_disable() protects against both preemption and soft interrupts + * on !RT kernels. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. + * + * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { - preempt_disable(); - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); } static inline void fpregs_unlock(void) { - local_bh_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 84b9449be080..9f3130f40807 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -41,6 +41,24 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned regs->orig_ax = addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + /* Only when FL_SAVE_REGS is set, cs will be non zero */ + if (!fregs->regs.cs) + return NULL; + return &fregs->regs; +} + +#define ftrace_instruction_pointer_set(fregs, _ip) \ + do { (fregs)->regs.ip = (_ip); } while (0) +#endif + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 0f420b24e0fc..032e020853aa 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -23,7 +23,6 @@ #include <linux/interrupt.h> #include <linux/threads.h> -#include <asm/kmap_types.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/fixmap.h> @@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -void *kmap_atomic_pfn(unsigned long pfn); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); - #define flush_cache_kmaps() do { } while (0) +#define arch_kmap_local_post_map(vaddr, pteval) \ + arch_flush_lazy_mmu_mode() + +#define arch_kmap_local_post_unmap(vaddr) \ + do { \ + flush_tlb_one_kernel((vaddr)); \ + arch_flush_lazy_mmu_mode(); \ + } while (0) + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 6352dee37cda..ab9f3dd87c80 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,17 +74,6 @@ extern void hpet_disable(void); extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); -struct irq_data; -struct hpet_channel; -struct irq_domain; - -extern void hpet_msi_unmask(struct irq_data *data); -extern void hpet_msi_mask(struct irq_data *data); -extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg); -extern struct irq_domain *hpet_create_irq_domain(int hpet_id); -extern int hpet_assign_irq(struct irq_domain *domain, - struct hpet_channel *hc, int dev_num); - #ifdef CONFIG_HPET_EMULATE_RTC #include <linux/interrupt.h> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a4aeeaace040..d465ece58151 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -39,18 +39,16 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_PCI_MSI, X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_AMDVI, X86_IRQ_ALLOC_TYPE_UV, - X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT, - X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; struct ioapic_alloc_info { - int pin; - int node; - u32 trigger : 1; - u32 polarity : 1; - u32 valid : 1; - struct IO_APIC_route_entry *entry; + int pin; + int node; + u32 is_level : 1; + u32 active_low : 1; + u32 valid : 1; }; struct uv_alloc_info { diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 0ed20e8bba9e..e6cd3fee562b 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -21,7 +21,16 @@ #define HYPERV_CPUID_FEATURES 0x40000003 #define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 #define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C + +#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081 +#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */ + +#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 +/* Support for the extended IOAPIC RTE format */ +#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 @@ -104,6 +113,15 @@ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) /* + * CPU management features identification. + * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits. + */ +#define HV_X64_START_LOGICAL_PROCESSOR BIT(0) +#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR BIT(1) +#define HV_X64_PERFORMANCE_COUNTER_SYNC BIT(2) +#define HV_X64_RESERVED_IDENTITY_BIT BIT(31) + +/* * Virtual processor will never share a physical core with another virtual * processor, except for virtual processors that are reported as sibling SMT * threads. @@ -115,6 +133,20 @@ #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) +/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */ +#define HV_PARAVISOR_PRESENT BIT(0) + +/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. */ +#define HV_ISOLATION_TYPE GENMASK(3, 0) +#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5) +#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6) + +enum hv_isolation_type { + HV_ISOLATION_TYPE_NONE = 0, + HV_ISOLATION_TYPE_VBS = 1, + HV_ISOLATION_TYPE_SNP = 2 +}; + /* Hyper-V specific model specific registers (MSRs) */ /* MSR used to identify the guest OS. */ @@ -516,6 +548,19 @@ struct hv_partition_assist_pg { u32 tlb_lock_count; }; +enum hv_interrupt_type { + HV_X64_INTERRUPT_TYPE_FIXED = 0x0000, + HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY = 0x0001, + HV_X64_INTERRUPT_TYPE_SMI = 0x0002, + HV_X64_INTERRUPT_TYPE_REMOTEREAD = 0x0003, + HV_X64_INTERRUPT_TYPE_NMI = 0x0004, + HV_X64_INTERRUPT_TYPE_INIT = 0x0005, + HV_X64_INTERRUPT_TYPE_SIPI = 0x0006, + HV_X64_INTERRUPT_TYPE_EXTINT = 0x0007, + HV_X64_INTERRUPT_TYPE_LOCALINT0 = 0x0008, + HV_X64_INTERRUPT_TYPE_LOCALINT1 = 0x0009, + HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A, +}; #include <asm-generic/hyperv-tlfs.h> diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b2442eb0ac2f..5eb3bdf36a41 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,9 +11,6 @@ #include <asm/irq_stack.h> -bool idtentry_enter_nmi(struct pt_regs *regs); -void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware @@ -190,23 +187,22 @@ __visible noinstr void func(struct pt_regs *regs, unsigned long error_code) * has to be done in the function body if necessary. */ #define DEFINE_IDTENTRY_IRQ(func) \ -static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ +static void __##func(struct pt_regs *regs, u32 vector); \ \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ irqentry_state_t state = irqentry_enter(regs); \ + u32 vector = (u32)(u8)error_code; \ \ instrumentation_begin(); \ - irq_enter_rcu(); \ kvm_set_cpu_l1tf_flush_l1d(); \ - __##func (regs, (u8)error_code); \ - irq_exit_rcu(); \ + run_irq_on_irqstack_cond(__##func, regs, vector); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ -static __always_inline void __##func(struct pt_regs *regs, u8 vector) +static noinline void __##func(struct pt_regs *regs, u32 vector) /** * DECLARE_IDTENTRY_SYSVEC - Declare functions for system vector entry points @@ -240,10 +236,8 @@ __visible noinstr void func(struct pt_regs *regs) \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - irq_enter_rcu(); \ kvm_set_cpu_l1tf_flush_l1d(); \ run_sysvec_on_irqstack_cond(__##func, regs); \ - irq_exit_rcu(); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ @@ -588,6 +582,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); #else DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); #endif +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); +#endif #endif /* NMI */ @@ -608,6 +605,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault); +#endif /* #VC */ #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -616,6 +616,7 @@ DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication); #ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); +DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); #endif /* Device interrupts common/spurious */ diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index a0f839aa144d..98b4dae5e8bc 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); +int insn_fetch_from_user_inatomic(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); bool insn_decode(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index a8c3d284fa46..95a448fbb44c 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -7,9 +7,12 @@ * Copyright (C) IBM Corporation, 2009 */ +#include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) + struct insn_field { union { insn_value_t value; @@ -20,6 +23,48 @@ struct insn_field { unsigned char nbytes; }; +static inline void insn_field_set(struct insn_field *p, insn_value_t v, + unsigned char n) +{ + p->value = v; + p->nbytes = n; +} + +static inline void insn_set_byte(struct insn_field *p, unsigned char n, + insn_byte_t v) +{ + p->bytes[n] = v; +} + +#else + +struct insn_field { + insn_value_t value; + union { + insn_value_t little; + insn_byte_t bytes[4]; + }; + /* !0 if we've run insn_get_xxx() for this field */ + unsigned char got; + unsigned char nbytes; +}; + +static inline void insn_field_set(struct insn_field *p, insn_value_t v, + unsigned char n) +{ + p->value = v; + p->little = __cpu_to_le32(v); + p->nbytes = n; +} + +static inline void insn_set_byte(struct insn_field *p, unsigned char n, + insn_byte_t v) +{ + p->bytes[n] = v; + p->value = __le32_to_cpu(p->little); +} +#endif + struct insn { struct insn_field prefixes; /* * Prefixes diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index bd7f02480ca1..438ccd4f3cc4 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -143,21 +143,6 @@ .macro MODRM mod opd1 opd2 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) .endm - -.macro RDPID opd - REG_TYPE rdpid_opd_type \opd - .if rdpid_opd_type == REG_TYPE_R64 - R64_NUM rdpid_opd \opd - .else - R32_NUM rdpid_opd \opd - .endif - .byte 0xf3 - .if rdpid_opd > 7 - PFX_REX rdpid_opd 0 - .endif - .byte 0x0f, 0xc7 - MODRM 0xc0 rdpid_opd 0x7 -.endm #endif #endif diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5e658ba2654a..9abe842dbd84 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -97,6 +97,7 @@ #define INTEL_FAM6_LAKEFIELD 0x8A #define INTEL_FAM6_ALDERLAKE 0x97 +#define INTEL_FAM6_ALDERLAKE_L 0x9A /* "Small Core" Processors (Atom) */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index cf0e25f45422..c201083b34f6 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -1,15 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * intel-mid.h: Intel MID specific setup code + * Intel MID specific setup code * - * (C) Copyright 2009 Intel Corporation + * (C) Copyright 2009, 2021 Intel Corporation */ #ifndef _ASM_X86_INTEL_MID_H #define _ASM_X86_INTEL_MID_H -#include <linux/sfi.h> #include <linux/pci.h> -#include <linux/platform_device.h> extern int intel_mid_pci_init(void); extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); @@ -22,93 +20,18 @@ extern void intel_mid_pwr_power_off(void); extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev); -extern int get_gpio_by_name(const char *name); -extern int __init sfi_parse_mrtc(struct sfi_table_header *table); -extern int __init sfi_parse_mtmr(struct sfi_table_header *table); -extern int sfi_mrtc_num; -extern struct sfi_rtc_table_entry sfi_mrtc_array[]; - -/* - * Here defines the array of devices platform data that IAFW would export - * through SFI "DEVS" table, we use name and type to match the device and - * its platform data. - */ -struct devs_id { - char name[SFI_NAME_LEN + 1]; - u8 type; - u8 delay; - u8 msic; - void *(*get_platform_data)(void *info); -}; - -#define sfi_device(i) \ - static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \ - __section(".x86_intel_mid_dev.init") = &i - -/** -* struct mid_sd_board_info - template for SD device creation -* @name: identifies the driver -* @bus_num: board-specific identifier for a given SD controller -* @max_clk: the maximum frequency device supports -* @platform_data: the particular data stored there is driver-specific -*/ -struct mid_sd_board_info { - char name[SFI_NAME_LEN]; - int bus_num; - unsigned short addr; - u32 max_clk; - void *platform_data; -}; - -/* - * Medfield is the follow-up of Moorestown, it combines two chip solution into - * one. Other than that it also added always-on and constant tsc and lapic - * timers. Medfield is the platform name, and the chip name is called Penwell - * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be - * identified via MSRs. - */ -enum intel_mid_cpu_type { - /* 1 was Moorestown */ - INTEL_MID_CPU_CHIP_PENWELL = 2, - INTEL_MID_CPU_CHIP_CLOVERVIEW, - INTEL_MID_CPU_CHIP_TANGIER, -}; - -extern enum intel_mid_cpu_type __intel_mid_cpu_chip; - #ifdef CONFIG_X86_INTEL_MID -static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) -{ - return __intel_mid_cpu_chip; -} - -static inline bool intel_mid_has_msic(void) -{ - return (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL); -} - extern void intel_scu_devices_create(void); extern void intel_scu_devices_destroy(void); #else /* !CONFIG_X86_INTEL_MID */ -#define intel_mid_identify_cpu() 0 -#define intel_mid_has_msic() 0 - static inline void intel_scu_devices_create(void) { } static inline void intel_scu_devices_destroy(void) { } #endif /* !CONFIG_X86_INTEL_MID */ -enum intel_mid_timer_options { - INTEL_MID_TIMER_DEFAULT, - INTEL_MID_TIMER_APBT_ONLY, - INTEL_MID_TIMER_LAPIC_APBT, -}; - -extern enum intel_mid_timer_options intel_mid_timer_options; - /* Bus Select SoC Fuse value */ #define BSEL_SOC_FUSE_MASK 0x7 /* FSB 133MHz */ @@ -118,16 +41,4 @@ extern enum intel_mid_timer_options intel_mid_timer_options; /* FSB 83MHz */ #define BSEL_SOC_FUSE_111 0x7 -#define SFI_MTMR_MAX_NUM 8 -#define SFI_MRTC_MAX 8 - -/* VRTC timer */ -#define MRST_VRTC_MAP_SZ 1024 -/* #define MRST_VRTC_PGOFFSET 0xc00 */ - -extern void intel_mid_rtc_init(void); - -/* The offset for the mapping of global gpio pin to irq */ -#define INTEL_MID_IRQ_OFFSET 0x100 - #endif /* _ASM_X86_INTEL_MID_H */ diff --git a/arch/x86/include/asm/intel_mid_vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h deleted file mode 100644 index 0b44b1abe4d9..000000000000 --- a/arch/x86/include/asm/intel_mid_vrtc.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _INTEL_MID_VRTC_H -#define _INTEL_MID_VRTC_H - -extern unsigned char vrtc_cmos_read(unsigned char reg); -extern void vrtc_cmos_write(unsigned char val, unsigned char reg); -extern void vrtc_get_time(struct timespec64 *now); -extern int vrtc_set_mmss(const struct timespec64 *now); - -#endif diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 11d457af68c5..8537f597d20a 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -65,6 +65,4 @@ static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int c inlen, out, outlen); } -#include <asm/intel_scu_ipc_legacy.h> - #endif diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h deleted file mode 100644 index 4cf13fecb673..000000000000 --- a/arch/x86/include/asm/intel_scu_ipc_legacy.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_INTEL_SCU_IPC_LEGACY_H_ -#define _ASM_X86_INTEL_SCU_IPC_LEGACY_H_ - -#include <linux/notifier.h> - -#define IPCMSG_INDIRECT_READ 0x02 -#define IPCMSG_INDIRECT_WRITE 0x05 - -#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ - -#define IPCMSG_WARM_RESET 0xF0 -#define IPCMSG_COLD_RESET 0xF1 -#define IPCMSG_SOFT_RESET 0xF2 -#define IPCMSG_COLD_BOOT 0xF3 - -#define IPCMSG_VRTC 0xFA /* Set vRTC device */ -/* Command id associated with message IPCMSG_VRTC */ -#define IPC_CMD_VRTC_SETTIME 1 /* Set time */ -#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ - -/* Don't call these in new code - they will be removed eventually */ - -/* Read single register */ -static inline int intel_scu_ipc_ioread8(u16 addr, u8 *data) -{ - return intel_scu_ipc_dev_ioread8(NULL, addr, data); -} - -/* Read a vector */ -static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len) -{ - return intel_scu_ipc_dev_readv(NULL, addr, data, len); -} - -/* Write single register */ -static inline int intel_scu_ipc_iowrite8(u16 addr, u8 data) -{ - return intel_scu_ipc_dev_iowrite8(NULL, addr, data); -} - -/* Write a vector */ -static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len) -{ - return intel_scu_ipc_dev_writev(NULL, addr, data, len); -} - -/* Update single register based on the mask */ -static inline int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask) -{ - return intel_scu_ipc_dev_update(NULL, addr, data, mask); -} - -/* Issue commands to the SCU with or without data */ -static inline int intel_scu_ipc_simple_command(int cmd, int sub) -{ - return intel_scu_ipc_dev_simple_command(NULL, cmd, sub); -} - -static inline int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, - u32 *out, int outlen) -{ - /* New API takes both inlen and outlen as bytes so convert here */ - size_t inbytes = inlen * sizeof(u32); - size_t outbytes = outlen * sizeof(u32); - - return intel_scu_ipc_dev_command_with_size(NULL, cmd, sub, in, inbytes, - inlen, out, outbytes); -} - -extern struct blocking_notifier_head intel_scu_notifier; - -static inline void intel_scu_notifier_add(struct notifier_block *nb) -{ - blocking_notifier_chain_register(&intel_scu_notifier, nb); -} - -static inline void intel_scu_notifier_remove(struct notifier_block *nb) -{ - blocking_notifier_chain_unregister(&intel_scu_notifier, nb); -} - -static inline int intel_scu_notifier_post(unsigned long v, void *p) -{ - return blocking_notifier_call_chain(&intel_scu_notifier, v, p); -} - -#define SCU_AVAILABLE 1 -#define SCU_DOWN 2 - -#endif diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a1a26f6d3aa4..437aa8d00e53 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -13,15 +13,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -/* I/O Unit Redirection Table */ -#define IO_APIC_REDIR_VECTOR_MASK 0x000FF -#define IO_APIC_REDIR_DEST_LOGICAL 0x00800 -#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 -#define IO_APIC_REDIR_SEND_PENDING (1 << 12) -#define IO_APIC_REDIR_REMOTE_IRR (1 << 14) -#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) -#define IO_APIC_REDIR_MASKED (1 << 16) - /* * The structure of the IO-APIC: */ @@ -65,52 +56,40 @@ union IO_APIC_reg_03 { }; struct IO_APIC_route_entry { - __u32 vector : 8, - delivery_mode : 3, /* 000: FIXED - * 001: lowest prio - * 111: ExtINT - */ - dest_mode : 1, /* 0: physical, 1: logical */ - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, /* 0: edge, 1: level */ - mask : 1, /* 0: enabled, 1: disabled */ - __reserved_2 : 15; - - __u32 __reserved_3 : 24, - dest : 8; -} __attribute__ ((packed)); - -struct IR_IO_APIC_route_entry { - __u64 vector : 8, - zero : 3, - index2 : 1, - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, - mask : 1, - reserved : 31, - format : 1, - index : 15; + union { + struct { + u64 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + delivery_status : 1, + active_low : 1, + irr : 1, + is_level : 1, + masked : 1, + reserved_0 : 15, + reserved_1 : 17, + virt_destid_8_14 : 7, + destid_0_7 : 8; + }; + struct { + u64 ir_shared_0 : 8, + ir_zero : 3, + ir_index_15 : 1, + ir_shared_1 : 5, + ir_reserved_0 : 31, + ir_format : 1, + ir_index_0_14 : 15; + }; + struct { + u64 w1 : 32, + w2 : 32; + }; + }; } __attribute__ ((packed)); struct irq_alloc_info; struct ioapic_domain_cfg; -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#define IOAPIC_MASKED 1 -#define IOAPIC_UNMASKED 0 - -#define IOAPIC_POL_HIGH 0 -#define IOAPIC_POL_LOW 1 - -#define IOAPIC_DEST_MODE_PHYSICAL 0 -#define IOAPIC_DEST_MODE_LOGICAL 1 - #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index bacf68c4d70e..e2de092fc38c 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -9,19 +9,14 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void -iounmap_atomic(void __iomem *kvaddr); +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); -int -iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); - -void -iomap_free(resource_size_t base, unsigned long size); +void iomap_free(resource_size_t base, unsigned long size); #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 528c8a71fe7f..768aa234cbb4 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -25,8 +25,6 @@ static inline int irq_canonicalize(int irq) extern int irq_init_percpu_irqstack(unsigned int cpu); -#define __ARCH_HAS_DO_SOFTIRQ - struct irq_desc; extern void fixup_irqs(void); @@ -40,8 +38,6 @@ extern void native_init_IRQ(void); extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); -extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector); - extern void init_ISA_irqs(void); extern void __init init_IRQ(void); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index af4a151d70b3..7cc49432187f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,9 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -extern struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info); - /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ extern struct irq_domain * arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); @@ -71,11 +68,5 @@ static inline void panic_if_irq_remap(const char *msg) { } -static inline struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 775816965c6a..9b2a0ff76c73 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -7,100 +7,217 @@ #include <asm/processor.h> #ifdef CONFIG_X86_64 -static __always_inline bool irqstack_active(void) -{ - return __this_cpu_read(irq_count) != -1; -} - -void asm_call_on_stack(void *sp, void (*func)(void), void *arg); -void asm_call_sysvec_on_stack(void *sp, void (*func)(struct pt_regs *regs), - struct pt_regs *regs); -void asm_call_irq_on_stack(void *sp, void (*func)(struct irq_desc *desc), - struct irq_desc *desc); -static __always_inline void __run_on_irqstack(void (*func)(void)) -{ - void *tos = __this_cpu_read(hardirq_stack_ptr); - - __this_cpu_add(irq_count, 1); - asm_call_on_stack(tos - 8, func, NULL); - __this_cpu_sub(irq_count, 1); +/* + * Macro to inline switching to an interrupt stack and invoking function + * calls from there. The following rules apply: + * + * - Ordering: + * + * 1. Write the stack pointer into the top most place of the irq + * stack. This ensures that the various unwinders can link back to the + * original stack. + * + * 2. Switch the stack pointer to the top of the irq stack. + * + * 3. Invoke whatever needs to be done (@asm_call argument) + * + * 4. Pop the original stack pointer from the top of the irq stack + * which brings it back to the original stack where it left off. + * + * - Function invocation: + * + * To allow flexible usage of the macro, the actual function code including + * the store of the arguments in the call ABI registers is handed in via + * the @asm_call argument. + * + * - Local variables: + * + * @tos: + * The @tos variable holds a pointer to the top of the irq stack and + * _must_ be allocated in a non-callee saved register as this is a + * restriction coming from objtool. + * + * Note, that (tos) is both in input and output constraints to ensure + * that the compiler does not assume that R11 is left untouched in + * case this macro is used in some place where the per cpu interrupt + * stack pointer is used again afterwards + * + * - Function arguments: + * The function argument(s), if any, have to be defined in register + * variables at the place where this is invoked. Storing the + * argument(s) in the proper register(s) is part of the @asm_call + * + * - Constraints: + * + * The constraints have to be done very carefully because the compiler + * does not know about the assembly call. + * + * output: + * As documented already above the @tos variable is required to be in + * the output constraints to make the compiler aware that R11 cannot be + * reused after the asm() statement. + * + * For builds with CONFIG_UNWIND_FRAME_POINTER ASM_CALL_CONSTRAINT is + * required as well as this prevents certain creative GCC variants from + * misplacing the ASM code. + * + * input: + * - func: + * Immediate, which tells the compiler that the function is referenced. + * + * - tos: + * Register. The actual register is defined by the variable declaration. + * + * - function arguments: + * The constraints are handed in via the 'argconstr' argument list. They + * describe the register arguments which are used in @asm_call. + * + * clobbers: + * Function calls can clobber anything except the callee-saved + * registers. Tell the compiler. + */ +#define call_on_irqstack(func, asm_call, argconstr...) \ +{ \ + register void *tos asm("r11"); \ + \ + tos = ((void *)__this_cpu_read(hardirq_stack_ptr)); \ + \ + asm_inline volatile( \ + "movq %%rsp, (%[tos]) \n" \ + "movq %[tos], %%rsp \n" \ + \ + asm_call \ + \ + "popq %%rsp \n" \ + \ + : "+r" (tos), ASM_CALL_CONSTRAINT \ + : [__func] "i" (func), [tos] "r" (tos) argconstr \ + : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", \ + "memory" \ + ); \ } -static __always_inline void -__run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), - struct pt_regs *regs) -{ - void *tos = __this_cpu_read(hardirq_stack_ptr); - - __this_cpu_add(irq_count, 1); - asm_call_sysvec_on_stack(tos - 8, func, regs); - __this_cpu_sub(irq_count, 1); +/* Macros to assert type correctness for run_*_on_irqstack macros */ +#define assert_function_type(func, proto) \ + static_assert(__builtin_types_compatible_p(typeof(&func), proto)) + +#define assert_arg_type(arg, proto) \ + static_assert(__builtin_types_compatible_p(typeof(arg), proto)) + +/* + * Macro to invoke system vector and device interrupt C handlers. + */ +#define call_on_irqstack_cond(func, regs, asm_call, constr, c_args...) \ +{ \ + /* \ + * User mode entry and interrupt on the irq stack do not \ + * switch stacks. If from user mode the task stack is empty. \ + */ \ + if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ + irq_enter_rcu(); \ + func(c_args); \ + irq_exit_rcu(); \ + } else { \ + /* \ + * Mark the irq stack inuse _before_ and unmark _after_ \ + * switching stacks. Interrupts are disabled in both \ + * places. Invoke the stack switch macro with the call \ + * sequence which matches the above direct invocation. \ + */ \ + __this_cpu_write(hardirq_stack_inuse, true); \ + call_on_irqstack(func, asm_call, constr); \ + __this_cpu_write(hardirq_stack_inuse, false); \ + } \ } -static __always_inline void -__run_irq_on_irqstack(void (*func)(struct irq_desc *desc), - struct irq_desc *desc) -{ - void *tos = __this_cpu_read(hardirq_stack_ptr); - - __this_cpu_add(irq_count, 1); - asm_call_irq_on_stack(tos - 8, func, desc); - __this_cpu_sub(irq_count, 1); +/* + * Function call sequence for __call_on_irqstack() for system vectors. + * + * Note that irq_enter_rcu() and irq_exit_rcu() do not use the input + * mechanism because these functions are global and cannot be optimized out + * when compiling a particular source file which uses one of these macros. + * + * The argument (regs) does not need to be pushed or stashed in a callee + * saved register to be safe vs. the irq_enter_rcu() call because the + * clobbers already prevent the compiler from storing it in a callee + * clobbered register. As the compiler has to preserve @regs for the final + * call to idtentry_exit() anyway, it's likely that it does not cause extra + * effort for this asm magic. + */ +#define ASM_CALL_SYSVEC \ + "call irq_enter_rcu \n" \ + "movq %[arg1], %%rdi \n" \ + "call %P[__func] \n" \ + "call irq_exit_rcu \n" + +#define SYSVEC_CONSTRAINTS , [arg1] "r" (regs) + +#define run_sysvec_on_irqstack_cond(func, regs) \ +{ \ + assert_function_type(func, void (*)(struct pt_regs *)); \ + assert_arg_type(regs, struct pt_regs *); \ + \ + call_on_irqstack_cond(func, regs, ASM_CALL_SYSVEC, \ + SYSVEC_CONSTRAINTS, regs); \ } -#else /* CONFIG_X86_64 */ -static inline bool irqstack_active(void) { return false; } -static inline void __run_on_irqstack(void (*func)(void)) { } -static inline void __run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), - struct pt_regs *regs) { } -static inline void __run_irq_on_irqstack(void (*func)(struct irq_desc *desc), - struct irq_desc *desc) { } -#endif /* !CONFIG_X86_64 */ - -static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return false; - if (!regs) - return !irqstack_active(); - return !user_mode(regs) && !irqstack_active(); +/* + * As in ASM_CALL_SYSVEC above the clobbers force the compiler to store + * @regs and @vector in callee saved registers. + */ +#define ASM_CALL_IRQ \ + "call irq_enter_rcu \n" \ + "movq %[arg1], %%rdi \n" \ + "movl %[arg2], %%esi \n" \ + "call %P[__func] \n" \ + "call irq_exit_rcu \n" + +#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" (vector) + +#define run_irq_on_irqstack_cond(func, regs, vector) \ +{ \ + assert_function_type(func, void (*)(struct pt_regs *, u32)); \ + assert_arg_type(regs, struct pt_regs *); \ + assert_arg_type(vector, u32); \ + \ + call_on_irqstack_cond(func, regs, ASM_CALL_IRQ, \ + IRQ_CONSTRAINTS, regs, vector); \ } - -static __always_inline void run_on_irqstack_cond(void (*func)(void), - struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); - - if (irq_needs_irq_stack(regs)) - __run_on_irqstack(func); - else - func(); +#define ASM_CALL_SOFTIRQ \ + "call %P[__func] \n" + +/* + * Macro to invoke __do_softirq on the irq stack. This is only called from + * task context when bottom halfs are about to be reenabled and soft + * interrupts are pending to be processed. The interrupt stack cannot be in + * use here. + */ +#define do_softirq_own_stack() \ +{ \ + __this_cpu_write(hardirq_stack_inuse, true); \ + call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ); \ + __this_cpu_write(hardirq_stack_inuse, false); \ } -static __always_inline void -run_sysvec_on_irqstack_cond(void (*func)(struct pt_regs *regs), - struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); - - if (irq_needs_irq_stack(regs)) - __run_sysvec_on_irqstack(func, regs); - else - func(regs); +#else /* CONFIG_X86_64 */ +/* System vector handlers always run on the stack they interrupted. */ +#define run_sysvec_on_irqstack_cond(func, regs) \ +{ \ + irq_enter_rcu(); \ + func(regs); \ + irq_exit_rcu(); \ } -static __always_inline void -run_irq_on_irqstack_cond(void (*func)(struct irq_desc *desc), struct irq_desc *desc, - struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); - - if (irq_needs_irq_stack(regs)) - __run_irq_on_irqstack(func, desc); - else - func(desc); +/* Switches to the irq stack within func() */ +#define run_irq_on_irqstack_cond(func, regs, vector) \ +{ \ + irq_enter_rcu(); \ + func(regs, vector); \ + irq_exit_rcu(); \ } +#endif /* !CONFIG_X86_64 */ + #endif diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index cd684d45cb5f..125c23b7bad3 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -12,6 +12,9 @@ enum { X86_IRQ_ALLOC_LEGACY = 0x2, }; +extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec); +extern int x86_fwspec_is_hpet(struct irq_fwspec *fwspec); + extern struct irq_domain *x86_vector_domain; extern void init_irq_alloc_info(struct irq_alloc_info *info, diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2dfc8d380dab..144d70ea4393 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void) return flags; } -extern inline void native_restore_fl(unsigned long flags); -extern inline void native_restore_fl(unsigned long flags) -{ - asm volatile("push %0 ; popf" - : /* no output */ - :"g" (flags) - :"memory", "cc"); -} - static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); @@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void) return native_save_fl(); } -static __always_inline void arch_local_irq_restore(unsigned long flags) -{ - native_restore_fl(flags); -} - static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); @@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void) #define SAVE_FLAGS(x) pushfq; popq %rax #endif -#define SWAPGS swapgs -/* - * Currently paravirt can't handle swapgs nicely when we - * don't have a stack we can rely on (such as a user space - * stack). So we either find a way around these or just fault - * and emulate if a guest tries to call swapgs directly. - * - * Either way, this is a good way to document that we don't - * have a reliable stack. x86_64 only. - */ -#define SWAPGS_UNSAFE_STACK swapgs - #define INTERRUPT_RETURN jmp native_iret -#define USERGS_SYSRET64 \ - swapgs; \ - sysretq; -#define USERGS_SYSRET32 \ - swapgs; \ - sysretl #else #define INTERRUPT_RETURN iret @@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } + +static __always_inline void arch_local_irq_restore(unsigned long flags) +{ + if (!arch_irqs_disabled_flags(flags)) + arch_local_irq_enable(); +} +#else +#ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN_PV +#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV +#else +#define SWAPGS swapgs +#endif +#endif #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h new file mode 100644 index 000000000000..97bbb4a9083a --- /dev/null +++ b/arch/x86/include/asm/kfence.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * x86 KFENCE support. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef _ASM_X86_KFENCE_H +#define _ASM_X86_KFENCE_H + +#include <linux/bug.h> +#include <linux/kfence.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/set_memory.h> +#include <asm/tlbflush.h> + +/* Force 4K pages for __kfence_pool. */ +static inline bool arch_kfence_init_pool(void) +{ + unsigned long addr; + + for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); + addr += PAGE_SIZE) { + unsigned int level; + + if (!lookup_address(addr, &level)) + return false; + + if (level != PG_LEVEL_4K) + set_memory_4k(addr, 1); + } + + return true; +} + +/* Protect the given page and flush TLB. */ +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + unsigned int level; + pte_t *pte = lookup_address(addr, &level); + + if (WARN_ON(!pte || level != PG_LEVEL_4K)) + return false; + + /* + * We need to avoid IPIs, as we may get KFENCE allocations or faults + * with interrupts disabled. Therefore, the below is best-effort, and + * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; + * lazy fault handling takes care of faults after the page is PRESENT. + */ + + if (protect) + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + else + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + + /* Flush this CPU's TLB. */ + flush_tlb_one_kernel(addr); + return true; +} + +#endif /* _ASM_X86_KFENCE_H */ diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h deleted file mode 100644 index 04ab8266e347..000000000000 --- a/arch/x86/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KMAP_TYPES_H -#define _ASM_X86_KMAP_TYPES_H - -#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 991a7ad540c7..d20a3d6be36e 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -58,14 +58,17 @@ struct arch_specific_insn { /* copy of the original instruction */ kprobe_opcode_t *insn; /* - * boostable = false: This instruction type is not boostable. - * boostable = true: This instruction has been boosted: we have + * boostable = 0: This instruction type is not boostable. + * boostable = 1: This instruction has been boosted: we have * added a relative jump after the instruction copy in insn, * so no single-step and fixup are needed (unless there's * a post_handler). */ - bool boostable; - bool if_modifier; + unsigned boostable:1; + unsigned if_modifier:1; + unsigned is_call:1; + unsigned is_pushf:1; + unsigned is_abs_ip:1; /* Number of bytes of text poked */ int tp_len; }; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h new file mode 100644 index 000000000000..323641097f63 --- /dev/null +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL) +BUILD_BUG_ON(1) +#endif + +/* + * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate + * "static_call()"s. They are also intended for use when defining + * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those + * functions that follow the [svm|vmx]_func_name convention. + * KVM_X86_OP_NULL() can leave a NULL definition for the + * case where there is no definition or a function name that + * doesn't match the typical naming convention is supplied. + */ +KVM_X86_OP_NULL(hardware_enable) +KVM_X86_OP_NULL(hardware_disable) +KVM_X86_OP_NULL(hardware_unsetup) +KVM_X86_OP_NULL(cpu_has_accelerated_tpr) +KVM_X86_OP(has_emulated_msr) +KVM_X86_OP(vcpu_after_set_cpuid) +KVM_X86_OP(vm_init) +KVM_X86_OP_NULL(vm_destroy) +KVM_X86_OP(vcpu_create) +KVM_X86_OP(vcpu_free) +KVM_X86_OP(vcpu_reset) +KVM_X86_OP(prepare_guest_switch) +KVM_X86_OP(vcpu_load) +KVM_X86_OP(vcpu_put) +KVM_X86_OP(update_exception_bitmap) +KVM_X86_OP(get_msr) +KVM_X86_OP(set_msr) +KVM_X86_OP(get_segment_base) +KVM_X86_OP(get_segment) +KVM_X86_OP(get_cpl) +KVM_X86_OP(set_segment) +KVM_X86_OP_NULL(get_cs_db_l_bits) +KVM_X86_OP(set_cr0) +KVM_X86_OP(is_valid_cr4) +KVM_X86_OP(set_cr4) +KVM_X86_OP(set_efer) +KVM_X86_OP(get_idt) +KVM_X86_OP(set_idt) +KVM_X86_OP(get_gdt) +KVM_X86_OP(set_gdt) +KVM_X86_OP(sync_dirty_debug_regs) +KVM_X86_OP(set_dr7) +KVM_X86_OP(cache_reg) +KVM_X86_OP(get_rflags) +KVM_X86_OP(set_rflags) +KVM_X86_OP(tlb_flush_all) +KVM_X86_OP(tlb_flush_current) +KVM_X86_OP_NULL(tlb_remote_flush) +KVM_X86_OP_NULL(tlb_remote_flush_with_range) +KVM_X86_OP(tlb_flush_gva) +KVM_X86_OP(tlb_flush_guest) +KVM_X86_OP(run) +KVM_X86_OP_NULL(handle_exit) +KVM_X86_OP_NULL(skip_emulated_instruction) +KVM_X86_OP_NULL(update_emulated_instruction) +KVM_X86_OP(set_interrupt_shadow) +KVM_X86_OP(get_interrupt_shadow) +KVM_X86_OP(patch_hypercall) +KVM_X86_OP(set_irq) +KVM_X86_OP(set_nmi) +KVM_X86_OP(queue_exception) +KVM_X86_OP(cancel_injection) +KVM_X86_OP(interrupt_allowed) +KVM_X86_OP(nmi_allowed) +KVM_X86_OP(get_nmi_mask) +KVM_X86_OP(set_nmi_mask) +KVM_X86_OP(enable_nmi_window) +KVM_X86_OP(enable_irq_window) +KVM_X86_OP(update_cr8_intercept) +KVM_X86_OP(check_apicv_inhibit_reasons) +KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl) +KVM_X86_OP(refresh_apicv_exec_ctrl) +KVM_X86_OP(hwapic_irr_update) +KVM_X86_OP(hwapic_isr_update) +KVM_X86_OP_NULL(guest_apic_has_interrupt) +KVM_X86_OP(load_eoi_exitmap) +KVM_X86_OP(set_virtual_apic_mode) +KVM_X86_OP_NULL(set_apic_access_page_addr) +KVM_X86_OP(deliver_posted_interrupt) +KVM_X86_OP_NULL(sync_pir_to_irr) +KVM_X86_OP(set_tss_addr) +KVM_X86_OP(set_identity_map_addr) +KVM_X86_OP(get_mt_mask) +KVM_X86_OP(load_mmu_pgd) +KVM_X86_OP_NULL(has_wbinvd_exit) +KVM_X86_OP(write_l1_tsc_offset) +KVM_X86_OP(get_exit_info) +KVM_X86_OP(check_intercept) +KVM_X86_OP(handle_exit_irqoff) +KVM_X86_OP_NULL(request_immediate_exit) +KVM_X86_OP(sched_in) +KVM_X86_OP_NULL(update_cpu_dirty_logging) +KVM_X86_OP_NULL(pre_block) +KVM_X86_OP_NULL(post_block) +KVM_X86_OP_NULL(vcpu_blocking) +KVM_X86_OP_NULL(vcpu_unblocking) +KVM_X86_OP_NULL(update_pi_irte) +KVM_X86_OP_NULL(apicv_post_state_restore) +KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) +KVM_X86_OP_NULL(set_hv_timer) +KVM_X86_OP_NULL(cancel_hv_timer) +KVM_X86_OP(setup_mce) +KVM_X86_OP(smi_allowed) +KVM_X86_OP(pre_enter_smm) +KVM_X86_OP(pre_leave_smm) +KVM_X86_OP(enable_smi_window) +KVM_X86_OP_NULL(mem_enc_op) +KVM_X86_OP_NULL(mem_enc_reg_region) +KVM_X86_OP_NULL(mem_enc_unreg_region) +KVM_X86_OP(get_msr_feature) +KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(apic_init_signal_blocked) +KVM_X86_OP_NULL(enable_direct_tlbflush) +KVM_X86_OP_NULL(migrate_timers) +KVM_X86_OP(msr_filter_changed) +KVM_X86_OP_NULL(complete_emulated_msr) + +#undef KVM_X86_OP +#undef KVM_X86_OP_NULL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e5f33a0d0e2..3768819693e5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,10 +40,8 @@ #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 -#define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) #define KVM_HALT_POLL_NS_DEFAULT 200000 @@ -52,6 +50,9 @@ #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) +#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ + KVM_BUS_LOCK_DETECTION_EXIT) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -88,6 +89,8 @@ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) +#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ + KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -200,9 +203,17 @@ enum x86_intercept_stage; #define DR6_BS (1 << 14) #define DR6_BT (1 << 15) #define DR6_RTM (1 << 16) -#define DR6_FIXED_1 0xfffe0ff0 -#define DR6_INIT 0xffff0ff0 +/* + * DR6_ACTIVE_LOW combines fixed-1 and active-low bits. + * We can regard all the bits in DR6_FIXED_1 as active_low bits; + * they will never be 0 for now, but when they are defined + * in the future it will require no code change. + * + * DR6_ACTIVE_LOW is also used as the init/reset value for DR6. + */ +#define DR6_ACTIVE_LOW 0xffff0ff0 #define DR6_VOLATILE 0x0001e00f +#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) @@ -337,6 +348,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +#define KVM_HAVE_MMU_RWLOCK + struct kvm_mmu_page; /* @@ -358,8 +371,6 @@ struct kvm_mmu { int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); - void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte); hpa_t root_hpa; gpa_t root_pgd; union kvm_mmu_role mmu_role; @@ -510,6 +521,7 @@ struct kvm_vcpu_hv_synic { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { + struct kvm_vcpu *vcpu; u32 vp_index; u64 hv_vapic; s64 runtime_offset; @@ -520,6 +532,21 @@ struct kvm_vcpu_hv { cpumask_t tlb_flush; }; +/* Xen HVM per vcpu emulation context */ +struct kvm_vcpu_xen { + u64 hypercall_rip; + u32 current_runstate; + bool vcpu_info_set; + bool vcpu_time_info_set; + bool runstate_set; + struct gfn_to_hva_cache vcpu_info_cache; + struct gfn_to_hva_cache vcpu_time_info_cache; + struct gfn_to_hva_cache runstate_cache; + u64 last_steal; + u64 runstate_entry_time; + u64 runstate_times[4]; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -614,6 +641,7 @@ struct kvm_vcpu_arch { struct kvm_pio_request pio; void *pio_data; + void *guest_ins_data; u8 event_exit_inst_len; @@ -639,7 +667,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - unsigned long cr3_lm_rsvd_bits; + u64 reserved_gpa_bits; int maxphyaddr; int max_tdp_level; @@ -716,7 +744,9 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; - struct kvm_vcpu_hv hyperv; + bool hyperv_enabled; + struct kvm_vcpu_hv *hyperv; + struct kvm_vcpu_xen xen; cpumask_var_t wbinvd_dirty_mask; @@ -805,6 +835,9 @@ struct kvm_vcpu_arch { */ bool enforce; } pv_cpuid; + + /* Protected Guests */ + bool guest_state_protected; }; struct kvm_lpage_info { @@ -851,12 +884,29 @@ struct kvm_hv_syndbg { u64 options; }; +/* Current state of Hyper-V TSC page clocksource */ +enum hv_tsc_page_status { + /* TSC page was not set up or disabled */ + HV_TSC_PAGE_UNSET = 0, + /* TSC page MSR was written by the guest, update pending */ + HV_TSC_PAGE_GUEST_CHANGED, + /* TSC page MSR was written by KVM userspace, update pending */ + HV_TSC_PAGE_HOST_CHANGED, + /* TSC page was properly set up and is currently active */ + HV_TSC_PAGE_SET, + /* TSC page is currently being updated and therefore is inactive */ + HV_TSC_PAGE_UPDATING, + /* TSC page was set up with an inaccessible GPA */ + HV_TSC_PAGE_BROKEN, +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; + enum hv_tsc_page_status hv_tsc_page_status; /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; @@ -884,12 +934,26 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +/* Xen emulation context */ +struct kvm_xen { + bool long_mode; + bool shinfo_set; + u8 upcall_vector; + struct gfn_to_hva_cache shinfo_cache; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +struct kvm_x86_msr_filter { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; +}; + #define APICV_INHIBIT_REASON_DISABLE 0 #define APICV_INHIBIT_REASON_HYPERV 1 #define APICV_INHIBIT_REASON_NESTED 2 @@ -904,9 +968,6 @@ struct kvm_arch { unsigned int indirect_shadow_pages; u8 mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; - /* - * Hash table of struct kvm_mmu_page. - */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; struct list_head lpage_disallowed_mmu_pages; @@ -925,7 +986,7 @@ struct kvm_arch { struct kvm_pit *vpit; atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; - struct kvm_apic_map *apic_map; + struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_page_done; @@ -963,6 +1024,7 @@ struct kvm_arch { struct hlist_head mask_notifier_list; struct kvm_hv hyperv; + struct kvm_xen xen; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; @@ -973,6 +1035,7 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + int cpu_dirty_logging_count; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; @@ -985,18 +1048,16 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool bus_lock_detection_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; + struct kvm_x86_msr_filter __rcu *msr_filter; - struct { - u8 count; - bool default_allow:1; - struct msr_bitmap_range ranges[16]; - } msr_filter; - - struct kvm_pmu_event_filter *pmu_event_filter; + struct kvm_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; +#ifdef CONFIG_X86_64 /* * Whether the TDP MMU is enabled for this VM. This contains a * snapshot of the TDP MMU module parameter from when the VM was @@ -1006,16 +1067,41 @@ struct kvm_arch { */ bool tdp_mmu_enabled; - /* List of struct tdp_mmu_pages being used as roots */ + /* + * List of struct kvmp_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set. + * All struct kvm_mmu_pages in the list should have a positive + * root_count except when a thread holds the MMU lock and is removing + * an entry from the list. + */ struct list_head tdp_mmu_roots; - /* List of struct tdp_mmu_pages not being used as roots */ + + /* + * List of struct kvmp_mmu_pages not being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set and a root_count of 0. + */ struct list_head tdp_mmu_pages; + + /* + * Protects accesses to the following fields when the MMU lock + * is held in read mode: + * - tdp_mmu_pages (above) + * - the link field of struct kvm_mmu_pages used by the TDP MMU + * - lpage_disallowed_mmu_pages + * - the lpage_disallowed_link field of struct kvm_mmu_pages used + * by the TDP MMU + * It is acceptable, but not necessary, to acquire this lock when + * the thread holds the MMU lock in write mode. + */ + spinlock_t tdp_mmu_pages_lock; +#endif /* CONFIG_X86_64 */ }; struct kvm_vm_stat { ulong mmu_shadow_zapped; ulong mmu_pte_write; - ulong mmu_pte_updated; ulong mmu_pde_zapped; ulong mmu_flooded; ulong mmu_recycled; @@ -1088,7 +1174,7 @@ struct kvm_x86_ops { void (*hardware_disable)(void); void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); - bool (*has_emulated_msr)(u32 index); + bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); unsigned int vm_size; @@ -1115,7 +1201,8 @@ struct kvm_x86_ops { struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0); + void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1208,29 +1295,11 @@ struct kvm_x86_ops { void (*sched_in)(struct kvm_vcpu *kvm, int cpu); /* - * Arch-specific dirty logging hooks. These hooks are only supposed to - * be valid if the specific arch has hardware-accelerated dirty logging - * mechanism. Currently only for PML on VMX. - * - * - slot_enable_log_dirty: - * called when enabling log dirty mode for the slot. - * - slot_disable_log_dirty: - * called when disabling log dirty mode for the slot. - * also called when slot is created with log dirty disabled. - * - flush_log_dirty: - * called before reporting dirty_bitmap to userspace. - * - enable_log_dirty_pt_masked: - * called when reenabling log dirty for the GFNs in the mask after - * corresponding bits are cleared in slot->dirty_bitmap. + * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero + * value indicates CPU dirty logging is unsupported or disabled. */ - void (*slot_enable_log_dirty)(struct kvm *kvm, - struct kvm_memory_slot *slot); - void (*slot_disable_log_dirty)(struct kvm *kvm, - struct kvm_memory_slot *slot); - void (*flush_log_dirty)(struct kvm *kvm); - void (*enable_log_dirty_pt_masked)(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t offset, unsigned long mask); + int cpu_dirty_log_size; + void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; @@ -1280,6 +1349,9 @@ struct kvm_x86_ops { void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); + int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); + + void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); }; struct kvm_x86_nested_ops { @@ -1319,6 +1391,19 @@ extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; +#define KVM_X86_OP(func) \ + DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); +#define KVM_X86_OP_NULL KVM_X86_OP +#include <asm/kvm-x86-ops.h> + +static inline void kvm_ops_static_call_update(void) +{ +#define KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP_NULL KVM_X86_OP +#include <asm/kvm-x86-ops.h> +} + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -1330,7 +1415,7 @@ void kvm_arch_free_vm(struct kvm *kvm); static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) { if (kvm_x86_ops.tlb_remote_flush && - !kvm_x86_ops.tlb_remote_flush(kvm)) + !static_call(kvm_x86_tlb_remote_flush)(kvm)) return 0; else return -ENOTSUPP; @@ -1357,11 +1442,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); -void kvm_mmu_slot_set_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot); -void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); @@ -1400,6 +1480,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits; extern u64 kvm_max_tsc_scaling_ratio; /* 1ull << kvm_tsc_scaling_ratio_frac_bits */ extern u64 kvm_default_tsc_scaling_ratio; +/* bus lock detection supported? */ +extern bool kvm_has_bus_lock_exit; extern u64 kvm_mce_cap_supported; @@ -1461,6 +1543,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1470,12 +1553,16 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); +void kvm_free_guest_fpu(struct kvm_vcpu *vcpu); + +void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); +void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); @@ -1526,7 +1613,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); @@ -1696,7 +1782,8 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); @@ -1715,14 +1802,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops.vcpu_blocking) - kvm_x86_ops.vcpu_blocking(vcpu); + static_call_cond(kvm_x86_vcpu_blocking)(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops.vcpu_unblocking) - kvm_x86_ops.vcpu_unblocking(vcpu); + static_call_cond(kvm_x86_vcpu_unblocking)(vcpu); } static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} @@ -1743,4 +1828,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) +int kvm_cpu_dirty_log_size(void); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 1fde1ab6559e..7c5cc6660e4b 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -12,9 +12,9 @@ #include <asm/setup.h> #include <linux/ftrace.h> -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - regs->ip = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h deleted file mode 100644 index 36c93b5cc239..000000000000 --- a/arch/x86/include/asm/local64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/local64.h> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a0f147893a04..ddfb3cad8dff 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -177,7 +177,8 @@ enum mce_notifier_prios { MCE_PRIO_EXTLOG, MCE_PRIO_UC, MCE_PRIO_EARLY, - MCE_PRIO_CEC + MCE_PRIO_CEC, + MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; struct notifier_block; @@ -198,16 +199,22 @@ static inline void enable_copy_mc_fragile(void) } #endif +struct cper_ia_proc_ctx; + #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); void mcheck_vendor_init_severity(void); +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline void mcheck_vendor_init_severity(void) {} +static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id) { return -EINVAL; } #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -282,28 +289,6 @@ extern void (*mce_threshold_vector)(void); extern void (*deferred_error_int_vector)(void); /* - * Thermal handler - */ - -void intel_init_thermal(struct cpuinfo_x86 *c); - -/* Interrupt Handler for core thermal thresholds */ -extern int (*platform_thermal_notify)(__u64 msr_val); - -/* Interrupt Handler for package thermal thresholds */ -extern int (*platform_thermal_package_notify)(__u64 msr_val); - -/* Callback support of rate control, return true, if - * callback has rate control */ -extern bool (*platform_thermal_package_rate_control)(void); - -#ifdef CONFIG_X86_THERMAL_VECTOR -extern void mcheck_intel_therm_init(void); -#else -static inline void mcheck_intel_therm_init(void) { } -#endif - -/* * Used by APEI to report memory error via /dev/mcelog */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 2f62bbdd9d12..31c4df123aa0 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -37,6 +37,7 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); +void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -69,6 +70,7 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } +static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2b7cc5397f80..ab45a220fac4 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -127,14 +127,12 @@ static inline unsigned int x86_cpuid_family(void) } #ifdef CONFIG_MICROCODE -int __init microcode_init(void); extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); extern bool initrd_gone; #else -static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void reload_early_microcode(void) { } diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 9257667d13c5..5d7494631ea9 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -6,6 +6,12 @@ #include <linux/rwsem.h> #include <linux/mutex.h> #include <linux/atomic.h> +#include <linux/bits.h> + +/* Uprobes on this MM assume 32-bit code */ +#define MM_CONTEXT_UPROBE_IA32 BIT(0) +/* vsyscall page is accessible on this MM */ +#define MM_CONTEXT_HAS_VSYSCALL BIT(1) /* * x86 has arch-specific MMU state beyond what lives in mm_struct. @@ -33,8 +39,7 @@ typedef struct { #endif #ifdef CONFIG_X86_64 - /* True if mm supports a task running in 32 bit compatibility mode. */ - unsigned short ia32_compat; + unsigned short flags; #endif struct mutex lock; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index d98016b83755..27516046117a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -91,12 +91,14 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) } #endif +#define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). */ +#define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { @@ -116,6 +118,8 @@ static inline int init_new_context(struct task_struct *tsk, init_new_context_ldt(mm); return 0; } + +#define destroy_context destroy_context static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); @@ -177,7 +181,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm) static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || - !(mm->context.ia32_compat == TIF_IA32); + !(mm->context.flags & MM_CONTEXT_UPROBE_IA32); } #else static inline bool is_64bit_mm(struct mm_struct *mm) @@ -214,4 +218,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, unsigned long __get_current_cr3_fast(void); +#include <asm-generic/mmu_context.h> + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ffc289992d1b..ccf60a809a17 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -74,8 +74,17 @@ static inline void hv_disable_stimer0_percpu_irq(int irq) {} #if IS_ENABLED(CONFIG_HYPERV) +extern int hyperv_init_cpuhp; + extern void *hv_hypercall_pg; extern void __percpu **hyperv_pcpu_input_arg; +extern void __percpu **hyperv_pcpu_output_arg; + +extern u64 hv_current_partition_id; + +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); +int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { @@ -237,6 +246,8 @@ int hyperv_fill_flush_guest_mapping_list( struct hv_guest_mapping_flush_list *flush, u64 start_gfn, u64 end_gfn); +extern bool hv_root_partition; + #ifdef CONFIG_X86_64 void hv_apic_init(void); void __init hv_init_spinlocks(void); @@ -248,10 +259,16 @@ static inline void hv_apic_init(void) {} static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, struct msi_desc *msi_desc) { - msi_entry->address = msi_desc->msg.address_lo; - msi_entry->data = msi_desc->msg.data; + msi_entry->address.as_uint32 = msi_desc->msg.address_lo; + msi_entry->data.as_uint32 = msi_desc->msg.data; } +struct irq_domain *hv_create_pci_msi_domain(void); + +int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, + struct hv_interrupt_entry *entry); +int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); + #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline void hyperv_setup_mmu_ops(void) {} diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index cd30013d15d3..b85147d75626 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -9,4 +9,54 @@ typedef struct irq_alloc_info msi_alloc_info_t; int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); +/* Structs and defines for the X86 specific MSI message format */ + +typedef struct x86_msi_data { + u32 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + reserved : 2, + active_low : 1, + is_level : 1; + + u32 dmar_subhandle; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#define arch_msi_msg_data x86_msi_data + +typedef struct x86_msi_addr_lo { + union { + struct { + u32 reserved_0 : 2, + dest_mode_logical : 1, + redirect_hint : 1, + reserved_1 : 1, + virt_destid_8_14 : 7, + destid_0_7 : 8, + base_address : 12; + }; + struct { + u32 dmar_reserved_0 : 2, + dmar_index_15 : 1, + dmar_subhandle_valid : 1, + dmar_format : 1, + dmar_index_0_14 : 15, + dmar_base_address : 12; + }; + }; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#define arch_msi_msg_addr_lo x86_msi_addr_lo + +#define X86_MSI_BASE_ADDRESS_LOW (0xfee00000 >> 20) + +typedef struct x86_msi_addr_hi { + u32 reserved : 8, + destid_8_31 : 24; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#define arch_msi_msg_addr_hi x86_msi_addr_hi + +#define X86_MSI_BASE_ADDRESS_HIGH (0) + +struct msi_msg; +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h deleted file mode 100644 index ee2f8ccc32d0..000000000000 --- a/arch/x86/include/asm/msidef.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MSIDEF_H -#define _ASM_X86_MSIDEF_H - -/* - * Constants for Intel APIC based MSI messages. - */ - -/* - * Shifts for MSI data - */ - -#define MSI_DATA_VECTOR_SHIFT 0 -#define MSI_DATA_VECTOR_MASK 0x000000ff -#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \ - MSI_DATA_VECTOR_MASK) - -#define MSI_DATA_DELIVERY_MODE_SHIFT 8 -#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) -#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) - -#define MSI_DATA_LEVEL_SHIFT 14 -#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) -#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) - -#define MSI_DATA_TRIGGER_SHIFT 15 -#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) -#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) - -/* - * Shift/mask fields for msi address - */ - -#define MSI_ADDR_BASE_HI 0 -#define MSI_ADDR_BASE_LO 0xfee00000 - -#define MSI_ADDR_DEST_MODE_SHIFT 2 -#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT) -#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT) - -#define MSI_ADDR_REDIRECTION_SHIFT 3 -#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) - /* dedicated cpu */ -#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) - /* lowest priority */ - -#define MSI_ADDR_DEST_ID_SHIFT 12 -#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 -#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ - MSI_ADDR_DEST_ID_MASK) -#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00) - -#define MSI_ADDR_IR_EXT_INT (1 << 4) -#define MSI_ADDR_IR_SHV (1 << 3) -#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13) -#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5) -#endif /* _ASM_X86_MSIDEF_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 972a34d93505..546d6ecf0a35 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -139,6 +139,7 @@ #define MSR_IA32_MCG_CAP 0x00000179 #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_ERROR_CONTROL 0x0000017f #define MSR_IA32_MCG_EXT_CTL 0x000004d0 #define MSR_OFFCORE_RSP_0 0x000001a6 @@ -326,8 +327,9 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 -#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b #define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 +#define MSR_AMD_CORE_ENERGY_STATUS 0xc001029a +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b /* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 @@ -470,6 +472,7 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 @@ -609,6 +612,8 @@ #define FEAT_CTL_LOCKED BIT(0) #define FEAT_CTL_VMX_ENABLED_INSIDE_SMX BIT(1) #define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX BIT(2) +#define FEAT_CTL_SGX_LC_ENABLED BIT(17) +#define FEAT_CTL_SGX_ENABLED BIT(18) #define FEAT_CTL_LMCE_ENABLED BIT(20) #define MSR_IA32_TSC_ADJUST 0x0000003b @@ -628,6 +633,12 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +/* Intel SGX Launch Enclave Public Key Hash MSRs */ +#define MSR_IA32_SGXLEPUBKEYHASH0 0x0000008C +#define MSR_IA32_SGXLEPUBKEYHASH1 0x0000008D +#define MSR_IA32_SGXLEPUBKEYHASH2 0x0000008E +#define MSR_IA32_SGXLEPUBKEYHASH3 0x0000008F + #define MSR_IA32_SMM_MONITOR_CTL 0x0000009b #define MSR_IA32_SMBASE 0x0000009e diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 0b4920a7238e..e16cccdd0420 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -86,7 +86,7 @@ static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} * think of extending them - you will be slapped with a stinking trout or a frozen * shark will reach you, wherever you are! You've been warned. */ -static inline unsigned long long notrace __rdmsr(unsigned int msr) +static __always_inline unsigned long long __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); @@ -98,7 +98,7 @@ static inline unsigned long long notrace __rdmsr(unsigned int msr) return EAX_EDX_VAL(val, low, high); } -static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high) +static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 9d5d949e662e..1cb9c17a4cb4 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -9,7 +9,6 @@ #ifdef CONFIG_X86_LOCAL_APIC -extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index fdbffec4cfde..5a2baf28a1dc 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -40,6 +40,8 @@ #define ORC_REG_MAX 15 #ifndef __ASSEMBLY__ +#include <asm/byteorder.h> + /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. It contains only the necessary parts of DWARF @@ -51,10 +53,18 @@ struct orc_entry { s16 sp_offset; s16 bp_offset; +#if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; unsigned end:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned bp_reg:4; + unsigned sp_reg:4; + unsigned unused:5; + unsigned end:1; + unsigned type:2; +#endif } __packed; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f462895a33e4..faf9cc1c14bb 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -53,7 +53,13 @@ #define STACK_TOP_MAX STACK_TOP /* - * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) + * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual + * address for the kernel image, rather than the limit on the size itself. On + * 32-bit, this is not a strict limit, but this value is used to limit the + * link-time virtual address range of the kernel, and by KASLR to limit the + * randomized address from which the kernel is executed. A relocatable kernel + * can be loaded somewhat higher than KERNEL_IMAGE_SIZE as long as enough space + * remains for the vmalloc area. */ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 3f49dac03617..64297eabad63 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -66,7 +66,7 @@ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical * address, then that syscall will enter the kernel with a * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything executable + * We avoid this particular problem by preventing anything * from being mapped at the maximum canonical address. * * On AMD CPUs in the Ryzen family, there's a nasty bug in which the @@ -98,8 +98,10 @@ #define STACK_TOP_MAX TASK_SIZE_MAX /* - * Maximum kernel image size is limited to 1 GiB, due to the fixmap living - * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). + * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual + * address for the kernel image, rather than the limit on the size itself. + * This can be at most 1 GiB, due to the fixmap living in the next 1 GiB (see + * level2_kernel_pgt in arch/x86/kernel/head_64.S). * * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the * page tables are fully set up. diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d25cc6830e89..4abf110e2243 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void) return PVOP_CALLEE0(unsigned long, irq.save_fl); } -static inline notrace void arch_local_irq_restore(unsigned long f) -{ - PVOP_VCALLEE1(irq.restore_fl, f); -} - static inline notrace void arch_local_irq_disable(void) { PVOP_VCALLEE0(irq.irq_disable); @@ -776,31 +771,6 @@ extern void default_banner(void); #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL -/* - * If swapgs is used while the userspace stack is still current, - * there's no way to call a pvop. The PV replacement *must* be - * inlined, or the swapgs instruction must be trapped and emulated. - */ -#define SWAPGS_UNSAFE_STACK \ - PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs) - -/* - * Note: swapgs is very special, and in practise is either going to be - * implemented with a single "swapgs" instruction or something very - * special. Either way, we don't need to save any registers for - * it. - */ -#define SWAPGS \ - PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ - ) - -#define USERGS_SYSRET64 \ - PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);) - #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS(clobbers) \ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \ @@ -812,17 +782,6 @@ extern void default_banner(void); #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_PARAVIRT_XXL - -#define GET_CR2_INTO_AX \ - PARA_SITE(PARA_PATCH(PV_MMU_read_cr2), \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); \ - ) - -#endif /* CONFIG_PARAVIRT_XXL */ - - #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0fad9f61c76a..de87087d3bde 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,7 +41,6 @@ #ifndef __ASSEMBLY__ #include <asm/desc_defs.h> -#include <asm/kmap_types.h> #include <asm/pgtable_types.h> #include <asm/nospec-branch.h> @@ -157,20 +156,10 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - /* Normal iret. Jump to this with the standard iret stack frame set up. */ void (*iret)(void); - void (*swapgs)(void); - void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); #endif @@ -179,16 +168,13 @@ struct pv_cpu_ops { struct pv_irq_ops { #ifdef CONFIG_PARAVIRT_XXL /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. + * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF; + * all other bits returned from save_fl are undefined. * * NOTE: These functions callers expect the callee to preserve * more registers than the standard C calling convention. */ struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; struct paravirt_callee_save irq_disable; struct paravirt_callee_save irq_enable; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index b9a7fd0a27e2..544f41a179fb 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -261,8 +261,12 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1) #define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2) #define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3) -#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND -#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \ +#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4) +#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5) +#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6) +#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7) +#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND +#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \ INTEL_PMC_MSK_FIXED_SLOTS) /* @@ -280,8 +284,14 @@ struct x86_pmu_capability { #define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */ #define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */ #define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */ -#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND -#define INTEL_TD_METRIC_NUM 4 +/* Level 2 metrics */ +#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */ +#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */ +#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */ +#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */ + +#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND +#define INTEL_TD_METRIC_NUM 8 static inline bool is_metric_idx(int idx) { @@ -483,11 +493,7 @@ static inline void perf_check_microcode(void) { } extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else -static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ - *nr = 0; - return NULL; -} +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index d7acae4120d5..7c9c968a42ef 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -57,19 +57,13 @@ do { \ #endif /* - * This is how much memory in addition to the memory covered up to - * and including _end we need mapped initially. - * We need: - * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) - * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) + * This is used to calculate the .brk reservation for initial pagetables. + * Enough space is reserved to allocate pagetables sufficient to cover all + * of LOWMEM_PAGES, which is an upper bound on the size of the direct map of + * lowmem. * - * Modulo rounding, each megabyte assigned here requires a kilobyte of - * memory, which is currently unreclaimed. - * - * This should be a multiple of a page. - * - * KERNEL_IMAGE_SIZE should be greater than pa(_end) - * and small than max_low_pfn, otherwise will waste some page table entries + * With PAE paging (PTRS_PER_PMD > 1), we allocate PTRS_PER_PGD == 4 pages for + * the PMD's in addition to the pages required for the last level pagetables. */ #if PTRS_PER_PMD > 1 #define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..91ac10654570 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d; #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END _AC(0xffffffffff000000, UL) +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP +# define MODULES_END _AC(0xffffffffff000000, UL) +#else +# define MODULES_END _AC(0xfffffffffe000000, UL) +#endif #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 394757ee030a..f24d7ef8fffa 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -177,8 +177,6 @@ enum page_cache_mode { #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) -#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) - #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h index 16b9f220bdeb..40f92270515b 100644 --- a/arch/x86/include/asm/platform_sst_audio.h +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -10,8 +10,6 @@ #ifndef _PLATFORM_SST_AUDIO_H_ #define _PLATFORM_SST_AUDIO_H_ -#include <linux/sfi.h> - #define MAX_NUM_STREAMS_MRFLD 25 #define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 69485ca13665..f8cb8af4de5c 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -5,6 +5,7 @@ #include <asm/rmwcc.h> #include <asm/percpu.h> #include <linux/thread_info.h> +#include <linux/static_call_types.h> DECLARE_PER_CPU(int, __preempt_count); @@ -103,16 +104,45 @@ static __always_inline bool should_resched(int preempt_offset) } #ifdef CONFIG_PREEMPTION - extern asmlinkage void preempt_schedule_thunk(void); -# define __preempt_schedule() \ - asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT) - extern asmlinkage void preempt_schedule(void); - extern asmlinkage void preempt_schedule_notrace_thunk(void); -# define __preempt_schedule_notrace() \ - asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT) +extern asmlinkage void preempt_schedule(void); +extern asmlinkage void preempt_schedule_thunk(void); - extern asmlinkage void preempt_schedule_notrace(void); -#endif +#define __preempt_schedule_func preempt_schedule_thunk + +extern asmlinkage void preempt_schedule_notrace(void); +extern asmlinkage void preempt_schedule_notrace_thunk(void); + +#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk + +#ifdef CONFIG_PREEMPT_DYNAMIC + +DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); + +#define __preempt_schedule() \ +do { \ + __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \ + asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ +} while (0) + +DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); + +#define __preempt_schedule_notrace() \ +do { \ + __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \ + asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \ +} while (0) + +#else /* PREEMPT_DYNAMIC */ + +#define __preempt_schedule() \ + asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT); + +#define __preempt_schedule_notrace() \ + asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT); + +#endif /* PREEMPT_DYNAMIC */ + +#endif /* PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 82a08b585818..f1b9ed5efaa9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -426,8 +426,6 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); - #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else @@ -454,7 +452,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(unsigned int, irq_count); +DECLARE_PER_CPU(void *, hardirq_stack_ptr); +DECLARE_PER_CPU(bool, hardirq_stack_inuse); extern asmlinkage void ignore_sysret(void); /* Save actual FS/GS selectors and bases to current->thread */ @@ -473,9 +472,9 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* Per CPU softirq stack pointer */ +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); -#endif /* X86_64 */ +#endif /* !X86_64 */ extern unsigned int fpu_kernel_xstate_size; extern unsigned int fpu_user_xstate_size; @@ -552,15 +551,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, *size = fpu_kernel_xstate_size; } -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ - static inline void native_load_sp0(unsigned long sp0) { @@ -813,10 +803,8 @@ extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); #ifdef CONFIG_CPU_SUP_AMD -extern u16 amd_get_nb_id(int cpu); extern u32 amd_get_nodes_per_socket(void); #else -static inline u16 amd_get_nb_id(int cpu) { return 0; } static inline u32 amd_get_nodes_per_socket(void) { return 0; } #endif diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 2c35f1c01a2d..b6a9d51d1d79 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(void); void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); +void entry_SYSCALL_compat_safe_stack(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d8324a236696..409f661481e1 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -94,6 +94,8 @@ struct pt_regs { #include <asm/paravirt_types.h> #endif +#include <asm/proto.h> + struct cpuinfo_x86; struct task_struct; @@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp + +static inline bool ip_within_syscall_gap(struct pt_regs *regs) +{ + bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + +#ifdef CONFIG_IA32_EMULATION + ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && + regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); +#endif + + return ret; +} #endif static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 3ff0d48469f2..b2d504f11937 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 07603064df8f..d60ed0668a59 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -56,19 +56,22 @@ static void __resctrl_sched_in(void) struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); u32 closid = state->default_closid; u32 rmid = state->default_rmid; + u32 tmp; /* * If this task has a closid/rmid assigned, use it. * Else use the closid/rmid assigned to this cpu. */ if (static_branch_likely(&rdt_alloc_enable_key)) { - if (current->closid) - closid = current->closid; + tmp = READ_ONCE(current->closid); + if (tmp) + closid = tmp; } if (static_branch_likely(&rdt_mon_enable_key)) { - if (current->rmid) - rmid = current->rmid; + tmp = READ_ONCE(current->rmid); + if (tmp) + rmid = tmp; } if (closid != state->cur_closid || rmid != state->cur_rmid) { diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 2bd1338de236..fef16e398161 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -16,6 +16,26 @@ #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn #endif +#ifdef CONFIG_X86_64 +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "x86_64" +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386 +# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "ia32" +# endif +/* + * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support + * caching them and they are treated as out of range syscalls, which will + * always pass through the BPF filter. + */ +#else /* !CONFIG_X86_64 */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ia32" +#endif + #include <asm-generic/seccomp.h> #endif /* _ASM_X86_SECCOMP_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 5948218f35c5..4352f08bfbb5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -82,6 +82,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8b58d6975d5d..0bc9b0895f33 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -58,9 +58,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "pushf; pop %0; " __ASM_CLAC "\n\t" - "1:" + ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; @@ -69,9 +68,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "push %0; popf\n\t" - "1:" + ALTERNATIVE("", "push %0; popf\n\t", + X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/softirq_stack.h b/arch/x86/include/asm/softirq_stack.h new file mode 100644 index 000000000000..889d53d6a0e1 --- /dev/null +++ b/arch/x86/include/asm/softirq_stack.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SOFTIRQ_STACK_H +#define _ASM_X86_SOFTIRQ_STACK_H + +#ifdef CONFIG_X86_64 +# include <asm/irq_stack.h> +#else +# include <asm-generic/softirq_stack.h> +#endif + +#endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index cc177b4431ae..1d3cbaef4bb7 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -243,10 +243,10 @@ static inline void serialize(void) } /* The dst parameter must be 64-bytes aligned */ -static inline void movdir64b(void *dst, const void *src) +static inline void movdir64b(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } *__dst = dst; + struct { char _[64]; } __iomem *__dst = dst; /* * MOVDIR64B %(rdx), rax. @@ -286,7 +286,7 @@ static inline void movdir64b(void *dst, const void *src) static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } *__dst = dst; + struct { char _[64]; } __iomem *__dst = dst; int zf; /* diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 49600643faba..f248eb2ac2d4 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -88,9 +88,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) return (unsigned long *)task->thread.sp; } -void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, const char *log_lvl); - /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index c37f11999d0c..cbb67b6030f9 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -37,4 +37,11 @@ #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop") + +#define ARCH_ADD_TRAMP_KEY(name) \ + asm(".pushsection .static_call_tramp_key, \"a\" \n" \ + ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \ + ".long " STATIC_CALL_KEY_STR(name) " - . \n" \ + ".popsection \n") + #endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 71d630bb5e08..1c561945b426 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -98,6 +98,16 @@ enum { INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, INTERCEPT_RDPRU, + TRAP_EFER_WRITE, + TRAP_CR0_WRITE, + TRAP_CR1_WRITE, + TRAP_CR2_WRITE, + TRAP_CR3_WRITE, + TRAP_CR4_WRITE, + TRAP_CR5_WRITE, + TRAP_CR6_WRITE, + TRAP_CR7_WRITE, + TRAP_CR8_WRITE, /* Byte offset 014h (word 5) */ INTERCEPT_INVLPGB = 160, INTERCEPT_INVLPGB_ILLEGAL, @@ -130,7 +140,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 exit_int_info_err; u64 nested_ctl; u64 avic_vapic_bar; - u8 reserved_4[8]; + u64 ghcb_gpa; u32 event_inj; u32 event_inj_err; u64 nested_cr3; @@ -144,6 +154,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[8]; + u64 vmsa_pa; /* Used for an SEV-ES guest */ }; @@ -178,7 +190,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define LBR_CTL_ENABLE_MASK BIT_ULL(0) #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) -#define SVM_INTERRUPT_SHADOW_MASK 1 +#define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0) +#define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1) #define SVM_IOIO_STR_SHIFT 2 #define SVM_IOIO_REP_SHIFT 3 @@ -197,6 +210,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) +#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) struct vmcb_seg { u16 selector; @@ -220,7 +234,8 @@ struct vmcb_save_area { u8 cpl; u8 reserved_2[4]; u64 efer; - u8 reserved_3[112]; + u8 reserved_3[104]; + u64 xss; /* Valid for SEV-ES only */ u64 cr4; u64 cr3; u64 cr0; @@ -251,9 +266,12 @@ struct vmcb_save_area { /* * The following part of the save area is valid only for - * SEV-ES guests when referenced through the GHCB. + * SEV-ES guests when referenced through the GHCB or for + * saving to the host save area. */ - u8 reserved_7[104]; + u8 reserved_7[80]; + u32 pkru; + u8 reserved_7a[20]; u64 reserved_8; /* rax already available at 0x01f8 */ u64 rcx; u64 rdx; @@ -294,7 +312,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 -#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272 #define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) @@ -379,6 +397,16 @@ struct vmcb { (unsigned long *)&ghcb->save.valid_bitmap); \ } \ \ + static inline u64 ghcb_get_##field(struct ghcb *ghcb) \ + { \ + return ghcb->save.field; \ + } \ + \ + static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ + { \ + return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \ + } \ + \ static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ { \ __set_bit(GHCB_BITMAP_IDX(field), \ diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h new file mode 100644 index 000000000000..ddbdefd5b94f --- /dev/null +++ b/arch/x86/include/asm/thermal.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_THERMAL_H +#define _ASM_X86_THERMAL_H + +#ifdef CONFIG_X86_THERMAL_VECTOR +void intel_init_thermal(struct cpuinfo_x86 *c); +bool x86_thermal_enabled(void); +void intel_thermal_interrupt(void); +#else +static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } +#endif + +#endif /* _ASM_X86_THERMAL_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 44733a4bfc42..06b740bae431 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -55,6 +55,7 @@ struct task_struct; struct thread_info { unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ }; @@ -74,15 +75,11 @@ struct thread_info { * - these are process state flags that various assembly files * may need to access */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ -#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ -#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ -#define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -91,7 +88,7 @@ struct thread_info { #define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_IA32 17 /* IA32 compatibility process */ +#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -99,19 +96,13 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ -#define TIF_X32 30 /* 32-bit native x86-64 binary */ -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) -#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) @@ -120,16 +111,14 @@ struct thread_info { #define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_IA32 (1 << TIF_IA32) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) -#define _TIF_X32 (1 << TIF_X32) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ @@ -216,10 +205,23 @@ static inline int arch_within_stack_frames(const void * const stack, #endif +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ + +#ifndef __ASSEMBLY__ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ + +#define arch_set_restart_data(restart) \ + do { restart->arch_data = current_thread_info()->status; } while (0) + #endif -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_32 #define in_ia32_syscall() true diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 820082bd6880..1bfe979bb9bc 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -4,7 +4,6 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f4234575f3fd..9239399e5491 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -110,6 +110,8 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +extern unsigned int __max_die_per_package; + #ifdef CONFIG_SMP #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) @@ -118,8 +120,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) -extern unsigned int __max_die_per_package; - static inline int topology_max_die_per_package(void) { return __max_die_per_package; @@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled) } #endif +#ifdef CONFIG_ACPI_CPPC_LIB +void init_freq_invariance_cppc(void); +#define init_freq_invariance_cppc init_freq_invariance_cppc +#endif + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 305bc1214aef..10b1de500ab1 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -11,6 +11,7 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 15 == 1: SGX MMU page-fault */ enum x86_pf_error_code { X86_PF_PROT = 1 << 0, @@ -19,6 +20,7 @@ enum x86_pf_error_code { X86_PF_RSVD = 1 << 3, X86_PF_INSTR = 1 << 4, X86_PF_PK = 1 << 5, + X86_PF_SGX = 1 << 15, }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 664d4610d700..8e574c0afef8 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -48,17 +48,8 @@ UNWIND_HINT_REGS base=\base offset=\offset partial=1 .endm -.macro UNWIND_HINT_FUNC sp_offset=8 - UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL -.endm - -/* - * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN - * and sibling calls. On these, sp_offset denotes the expected offset from - * initial_func_cfi. - */ -.macro UNWIND_HINT_RET_OFFSET sp_offset=8 - UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset +.macro UNWIND_HINT_FUNC + UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 08b3d810dfba..1b6455f881f9 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -28,6 +28,20 @@ enum uv_bios_cmd { UV_BIOS_SET_LEGACY_VGA_TARGET }; +#define UV_BIOS_EXTRA 0x10000 +#define UV_BIOS_GET_PCI_TOPOLOGY 0x10001 +#define UV_BIOS_GET_GEOINFO 0x10003 + +#define UV_BIOS_EXTRA_OP_MEM_COPYIN 0x1000 +#define UV_BIOS_EXTRA_OP_MEM_COPYOUT 0x2000 +#define UV_BIOS_EXTRA_OP_MASK 0x0fff +#define UV_BIOS_EXTRA_GET_HEAPSIZE 1 +#define UV_BIOS_EXTRA_INSTALL_HEAP 2 +#define UV_BIOS_EXTRA_MASTER_NASID 3 +#define UV_BIOS_EXTRA_OBJECT_COUNT (10|UV_BIOS_EXTRA_OP_MEM_COPYOUT) +#define UV_BIOS_EXTRA_ENUM_OBJECTS (12|UV_BIOS_EXTRA_OP_MEM_COPYOUT) +#define UV_BIOS_EXTRA_ENUM_PORTS (13|UV_BIOS_EXTRA_OP_MEM_COPYOUT) + /* * Status values returned from a BIOS call. */ @@ -109,6 +123,32 @@ struct uv_systab { } entry[1]; /* additional entries follow */ }; extern struct uv_systab *uv_systab; + +#define UV_BIOS_MAXSTRING 128 +struct uv_bios_hub_info { + unsigned int id; + union { + struct { + unsigned long long this_part:1; + unsigned long long is_shared:1; + unsigned long long is_disabled:1; + } fields; + struct { + unsigned long long flags; + unsigned long long reserved; + } b; + } f; + char name[UV_BIOS_MAXSTRING]; + char location[UV_BIOS_MAXSTRING]; + unsigned int ports; +}; + +struct uv_bios_port_info { + unsigned int port; + unsigned int conn_id; + unsigned int conn_port; +}; + /* (... end of definitions from UV BIOS ...) */ enum { @@ -142,6 +182,15 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); +extern s64 uv_bios_get_master_nasid(u64 sz, u64 *nasid); +extern s64 uv_bios_get_heapsize(u64 nasid, u64 sz, u64 *heap_sz); +extern s64 uv_bios_install_heap(u64 nasid, u64 sz, u64 *heap); +extern s64 uv_bios_obj_count(u64 nasid, u64 sz, u64 *objcnt); +extern s64 uv_bios_enum_objs(u64 nasid, u64 sz, u64 *objbuf); +extern s64 uv_bios_enum_ports(u64 nasid, u64 obj_id, u64 sz, u64 *portbuf); +extern s64 uv_bios_get_geoinfo(u64 nasid, u64 sz, u64 *geo); +extern s64 uv_bios_get_pci_topology(u64 sz, u64 *buf); + extern int uv_bios_init(void); extern unsigned long get_uv_systab_phys(bool msg); @@ -151,6 +200,8 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; +extern ssize_t uv_get_archtype(char *buf, int len); +extern int uv_get_hubless_system(void); extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h new file mode 100644 index 000000000000..f241451035fb --- /dev/null +++ b/arch/x86/include/asm/uv/uv_geo.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2020 Hewlett Packard Enterprise Development LP. All rights reserved. + */ + +#ifndef _ASM_UV_GEO_H +#define _ASM_UV_GEO_H + +/* Type declaractions */ + +/* Size of a geoid_s structure (must be before decl. of geoid_u) */ +#define GEOID_SIZE 8 + +/* Fields common to all substructures */ +struct geo_common_s { + unsigned char type; /* What type of h/w is named by this geoid_s */ + unsigned char blade; + unsigned char slot; /* slot is IRU */ + unsigned char upos; + unsigned char rack; +}; + +/* Additional fields for particular types of hardware */ +struct geo_node_s { + struct geo_common_s common; /* No additional fields needed */ +}; + +struct geo_rtr_s { + struct geo_common_s common; /* No additional fields needed */ +}; + +struct geo_iocntl_s { + struct geo_common_s common; /* No additional fields needed */ +}; + +struct geo_pcicard_s { + struct geo_iocntl_s common; + char bus; /* Bus/widget number */ + char slot; /* PCI slot number */ +}; + +/* Subcomponents of a node */ +struct geo_cpu_s { + struct geo_node_s node; + unsigned char socket:4, /* Which CPU on the node */ + thread:4; + unsigned char core; +}; + +struct geo_mem_s { + struct geo_node_s node; + char membus; /* The memory bus on the node */ + char memslot; /* The memory slot on the bus */ +}; + +union geoid_u { + struct geo_common_s common; + struct geo_node_s node; + struct geo_iocntl_s iocntl; + struct geo_pcicard_s pcicard; + struct geo_rtr_s rtr; + struct geo_cpu_s cpu; + struct geo_mem_s mem; + char padsize[GEOID_SIZE]; +}; + +/* Defined constants */ + +#define GEO_MAX_LEN 48 + +#define GEO_TYPE_INVALID 0 +#define GEO_TYPE_MODULE 1 +#define GEO_TYPE_NODE 2 +#define GEO_TYPE_RTR 3 +#define GEO_TYPE_IOCNTL 4 +#define GEO_TYPE_IOCARD 5 +#define GEO_TYPE_CPU 6 +#define GEO_TYPE_MEM 7 +#define GEO_TYPE_MAX (GEO_TYPE_MEM+1) + +static inline int geo_rack(union geoid_u g) +{ + return (g.common.type == GEO_TYPE_INVALID) ? + -1 : g.common.rack; +} + +static inline int geo_slot(union geoid_u g) +{ + return (g.common.type == GEO_TYPE_INVALID) ? + -1 : g.common.upos; +} + +static inline int geo_blade(union geoid_u g) +{ + return (g.common.type == GEO_TYPE_INVALID) ? + -1 : g.common.blade * 2 + g.common.slot; +} + +#endif /* _ASM_UV_GEO_H */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bbcdc7b8f963..98aa103eb4ab 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -15,6 +15,8 @@ struct vdso_image { unsigned long size; /* Always a multiple of PAGE_SIZE */ unsigned long alt, alt_len; + unsigned long extable_base, extable_len; + const void *extable; long sym_vvar_start; /* Negative offset to the vvar area */ @@ -27,6 +29,8 @@ struct vdso_image { long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; long sym_int80_landing_pad; + long sym_vdso32_sigreturn_landing_pad; + long sym_vdso32_rt_sigreturn_landing_pad; }; #ifdef CONFIG_X86_64 @@ -45,6 +49,9 @@ extern void __init init_vdso_image(const struct vdso_image *image); extern int map_vdso_once(const struct vdso_image *image, unsigned long addr); +extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 9aad0e0876fb..8757078d4442 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -30,16 +30,29 @@ static inline int cpu_has_vmx(void) } -/** Disable VMX on the current CPU +/** + * cpu_vmxoff() - Disable VMX on the current CPU * - * vmxoff causes a undefined-opcode exception if vmxon was not run - * on the CPU previously. Only call this function if you know VMX - * is enabled. + * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) + * + * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to + * atomically track post-VMXON state, e.g. this may be called in NMI context. + * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. + * faults are guaranteed to be due to the !post-VMXON check unless the CPU is + * magically in RM, VM86, compat mode, or at CPL>0. */ -static inline void cpu_vmxoff(void) +static inline int cpu_vmxoff(void) { - asm volatile ("vmxoff"); + asm_volatile_goto("1: vmxoff\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "cc", "memory" : fault); + + cr4_clear_bits(X86_CR4_VMXE); + return 0; + +fault: cr4_clear_bits(X86_CR4_VMXE); + return -EIO; } static inline int cpu_vmx_enabled(void) diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 26efbec94448..9e8ac5073ecb 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -36,7 +36,6 @@ struct vm86 { unsigned long saved_sp0; unsigned long flags; - unsigned long screen_bitmap; unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f8ba5289ecb0..358707f60d99 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -73,6 +73,7 @@ #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) +#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) @@ -113,6 +114,7 @@ #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 #define VMX_MISC_ZERO_LEN_INS 0x40000000 #define VMX_MISC_MSR_LIST_MULTIPLIER 512 diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 9915990fd8cf..d9a74681a77d 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -83,5 +83,6 @@ #define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ +#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index dde5b3f1e7cd..5c69f7eb5d47 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -116,6 +116,7 @@ struct x86_init_pci { * @init_platform: platform setup * @guest_late_init: guest late init * @x2apic_available: X2APIC detection + * @msi_ext_dest_id: MSI supports 15-bit APIC IDs * @init_mem_mapping: setup early mappings during init_mem_mapping() * @init_after_bootmem: guest init after boot allocator is finished */ @@ -123,6 +124,7 @@ struct x86_hyper_init { void (*init_platform)(void); void (*guest_late_init)(void); bool (*x2apic_available)(void); + bool (*msi_ext_dest_id)(void); void (*init_mem_mapping)(void); void (*init_after_bootmem)(void); }; diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 9139b3e86316..baca0b00ef76 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -182,6 +182,9 @@ struct arch_shared_info { unsigned long p2m_cr3; /* cr3 value of the p2m address space */ unsigned long p2m_vaddr; /* virtual address of the p2m list */ unsigned long p2m_generation; /* generation count of p2m mapping */ +#ifdef CONFIG_X86_32 + uint32_t wc_sec_hi; +#endif }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 5941e18edd5a..1a162e559753 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -355,7 +355,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); -#define xen_remap(cookie, size) ioremap((cookie), (size)); +#define xen_remap(cookie, size) ioremap((cookie), (size)) #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 89e5f3d1bba8..5a3022c8af82 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -12,6 +12,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define DE_VECTOR 0 #define DB_VECTOR 1 @@ -111,6 +112,7 @@ struct kvm_ioapic_state { #define KVM_NR_IRQCHIPS 3 #define KVM_RUN_X86_SMM (1 << 0) +#define KVM_RUN_X86_BUS_LOCK (1 << 1) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h new file mode 100644 index 000000000000..9034f3007c4e --- /dev/null +++ b/arch/x86/include/uapi/asm/sgx.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ +#ifndef _UAPI_ASM_X86_SGX_H +#define _UAPI_ASM_X86_SGX_H + +#include <linux/types.h> +#include <linux/ioctl.h> + +/** + * enum sgx_page_flags - page control flags + * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of + * ENCLS[EEXTEND] operations. + */ +enum sgx_page_flags { + SGX_PAGE_MEASURE = 0x01, +}; + +#define SGX_MAGIC 0xA4 + +#define SGX_IOC_ENCLAVE_CREATE \ + _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create) +#define SGX_IOC_ENCLAVE_ADD_PAGES \ + _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages) +#define SGX_IOC_ENCLAVE_INIT \ + _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) +#define SGX_IOC_ENCLAVE_PROVISION \ + _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) + +/** + * struct sgx_enclave_create - parameter structure for the + * %SGX_IOC_ENCLAVE_CREATE ioctl + * @src: address for the SECS page data + */ +struct sgx_enclave_create { + __u64 src; +}; + +/** + * struct sgx_enclave_add_pages - parameter structure for the + * %SGX_IOC_ENCLAVE_ADD_PAGE ioctl + * @src: start address for the page data + * @offset: starting page offset + * @length: length of the data (multiple of the page size) + * @secinfo: address for the SECINFO data + * @flags: page control flags + * @count: number of bytes added (multiple of the page size) + */ +struct sgx_enclave_add_pages { + __u64 src; + __u64 offset; + __u64 length; + __u64 secinfo; + __u64 flags; + __u64 count; +}; + +/** + * struct sgx_enclave_init - parameter structure for the + * %SGX_IOC_ENCLAVE_INIT ioctl + * @sigstruct: address for the SIGSTRUCT data + */ +struct sgx_enclave_init { + __u64 sigstruct; +}; + +/** + * struct sgx_enclave_provision - parameter structure for the + * %SGX_IOC_ENCLAVE_PROVISION ioctl + * @fd: file handle of /dev/sgx_provision + */ +struct sgx_enclave_provision { + __u64 fd; +}; + +struct sgx_enclave_run; + +/** + * typedef sgx_enclave_user_handler_t - Exit handler function accepted by + * __vdso_sgx_enter_enclave() + * @run: The run instance given by the caller + * + * The register parameters contain the snapshot of their values at enclave + * exit. An invalid ENCLU function number will cause -EINVAL to be returned + * to the caller. + * + * Return: + * - <= 0: The given value is returned back to the caller. + * - > 0: ENCLU function to invoke, either EENTER or ERESUME. + */ +typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx, + long rsp, long r8, long r9, + struct sgx_enclave_run *run); + +/** + * struct sgx_enclave_run - the execution context of __vdso_sgx_enter_enclave() + * @tcs: TCS used to enter the enclave + * @function: The last seen ENCLU function (EENTER, ERESUME or EEXIT) + * @exception_vector: The interrupt vector of the exception + * @exception_error_code: The exception error code pulled out of the stack + * @exception_addr: The address that triggered the exception + * @user_handler: User provided callback run on exception + * @user_data: Data passed to the user handler + * @reserved Reserved for future extensions + * + * If @user_handler is provided, the handler will be invoked on all return paths + * of the normal flow. The user handler may transfer control, e.g. via a + * longjmp() call or a C++ exception, without returning to + * __vdso_sgx_enter_enclave(). + */ +struct sgx_enclave_run { + __u64 tcs; + __u32 function; + __u16 exception_vector; + __u16 exception_error_code; + __u64 exception_addr; + __u64 user_handler; + __u64 user_data; + __u8 reserved[216]; +}; + +/** + * typedef vdso_sgx_enter_enclave_t - Prototype for __vdso_sgx_enter_enclave(), + * a vDSO function to enter an SGX enclave. + * @rdi: Pass-through value for RDI + * @rsi: Pass-through value for RSI + * @rdx: Pass-through value for RDX + * @function: ENCLU function, must be EENTER or ERESUME + * @r8: Pass-through value for R8 + * @r9: Pass-through value for R9 + * @run: struct sgx_enclave_run, must be non-NULL + * + * NOTE: __vdso_sgx_enter_enclave() does not ensure full compliance with the + * x86-64 ABI, e.g. doesn't handle XSAVE state. Except for non-volatile + * general purpose registers, EFLAGS.DF, and RSP alignment, preserving/setting + * state in accordance with the x86-64 ABI is the responsibility of the enclave + * and its runtime, i.e. __vdso_sgx_enter_enclave() cannot be called from C + * code without careful consideration by both the enclave and its runtime. + * + * All general purpose registers except RAX, RBX and RCX are passed as-is to the + * enclave. RAX, RBX and RCX are consumed by EENTER and ERESUME and are loaded + * with @function, asynchronous exit pointer, and @run.tcs respectively. + * + * RBP and the stack are used to anchor __vdso_sgx_enter_enclave() to the + * pre-enclave state, e.g. to retrieve @run.exception and @run.user_handler + * after an enclave exit. All other registers are available for use by the + * enclave and its runtime, e.g. an enclave can push additional data onto the + * stack (and modify RSP) to pass information to the optional user handler (see + * below). + * + * Most exceptions reported on ENCLU, including those that occur within the + * enclave, are fixed up and reported synchronously instead of being delivered + * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are + * never fixed up and are always delivered via standard signals. On synchrously + * reported exceptions, -EFAULT is returned and details about the exception are + * recorded in @run.exception, the optional sgx_enclave_exception struct. + * + * Return: + * - 0: ENCLU function was successfully executed. + * - -EINVAL: Invalid ENCL number (neither EENTER nor ERESUME). + */ +typedef int (*vdso_sgx_enter_enclave_t)(unsigned long rdi, unsigned long rsi, + unsigned long rdx, unsigned int function, + unsigned long r8, unsigned long r9, + struct sgx_enclave_run *run); + +#endif /* _UAPI_ASM_X86_SGX_H */ diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index e5745d593dc7..164a22a72984 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -62,30 +62,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001u -#define SA_NOCLDWAIT 0x00000002u -#define SA_SIGINFO 0x00000004u -#define SA_ONSTACK 0x08000000u -#define SA_RESTART 0x10000000u -#define SA_NODEFER 0x40000000u -#define SA_RESETHAND 0x80000000u - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index f1d8307454e0..554f75fe013c 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -77,10 +77,28 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_RDPRU 0x08e +#define SVM_EXIT_EFER_WRITE_TRAP 0x08f +#define SVM_EXIT_CR0_WRITE_TRAP 0x090 +#define SVM_EXIT_CR1_WRITE_TRAP 0x091 +#define SVM_EXIT_CR2_WRITE_TRAP 0x092 +#define SVM_EXIT_CR3_WRITE_TRAP 0x093 +#define SVM_EXIT_CR4_WRITE_TRAP 0x094 +#define SVM_EXIT_CR5_WRITE_TRAP 0x095 +#define SVM_EXIT_CR6_WRITE_TRAP 0x096 +#define SVM_EXIT_CR7_WRITE_TRAP 0x097 +#define SVM_EXIT_CR8_WRITE_TRAP 0x098 +#define SVM_EXIT_CR9_WRITE_TRAP 0x099 +#define SVM_EXIT_CR10_WRITE_TRAP 0x09a +#define SVM_EXIT_CR11_WRITE_TRAP 0x09b +#define SVM_EXIT_CR12_WRITE_TRAP 0x09c +#define SVM_EXIT_CR13_WRITE_TRAP 0x09d +#define SVM_EXIT_CR14_WRITE_TRAP 0x09e +#define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 +#define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ #define SVM_VMGEXIT_MMIO_READ 0x80000001 @@ -183,10 +201,20 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \ + { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \ + { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ + { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ + { SVM_EXIT_VMGEXIT, "vmgexit" }, \ + { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \ + { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \ + { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \ + { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ + { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h index d2ee4e307ef8..18909b8050bc 100644 --- a/arch/x86/include/uapi/asm/vm86.h +++ b/arch/x86/include/uapi/asm/vm86.h @@ -97,7 +97,7 @@ struct revectored_struct { struct vm86_struct { struct vm86_regs regs; unsigned long flags; - unsigned long screen_bitmap; + unsigned long screen_bitmap; /* unused, preserved by vm86() */ unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; @@ -106,7 +106,7 @@ struct vm86_struct { /* * flags masks */ -#define VM86_SCREEN_BITMAP 0x0001 +#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */ struct vm86plus_info_struct { unsigned long force_return_for_pic:1; diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b8ff9e8ac0d5..b8e650a985e3 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -32,6 +32,7 @@ #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 +#define EXIT_REASON_SIPI_SIGNAL 4 #define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -88,12 +89,14 @@ #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 +#define EXIT_REASON_BUS_LOCK 74 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ + { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \ { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ @@ -148,7 +151,8 @@ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ - { EXIT_REASON_TPAUSE, "TPAUSE" } + { EXIT_REASON_TPAUSE, "TPAUSE" }, \ + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 68608bd892c0..2ddf08351f0b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -116,7 +116,6 @@ obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o -obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o @@ -161,5 +160,3 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o endif - -obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index f1bb57b0e41e..cf340d85946a 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c22fb55abcfd..0916f00a992e 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -43,3 +43,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) apei_mce_report_mem_error(sev, mem_err); #endif } + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + return apei_smca_report_x86_error(ctx_info, lapic_id); +} diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index c8daa92f38dc..56b6865afb2a 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0-only */ .text #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/segment.h> #include <asm/pgtable_types.h> #include <asm/page_types.h> #include <asm/msr.h> #include <asm/asm-offsets.h> #include <asm/frame.h> +#include <asm/nospec-branch.h> # Copyright 2003 Pavel Machek <pavel@suse.cz @@ -39,6 +41,7 @@ SYM_FUNC_START(wakeup_long64) movq saved_rbp, %rbp movq saved_rip, %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax SYM_FUNC_END(wakeup_long64) @@ -112,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. @@ -126,6 +129,7 @@ SYM_FUNC_START(do_suspend_lowlevel) FRAME_END jmp restore_processor_state SYM_FUNC_END(do_suspend_lowlevel) +STACK_FRAME_NON_STANDARD do_suspend_lowlevel .data saved_rbp: .quad 0 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2400ad62f330..8d778e46725d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1374,7 +1374,7 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy - * @handler: address to jump to when the temporary breakpoint is hit + * @emulate: instruction to be emulated * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 18f6b7c4bd79..b4396952c9a6 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -384,7 +384,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) int amd_get_subcaches(int cpu) { - struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link; unsigned int mask; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) @@ -398,7 +398,7 @@ int amd_get_subcaches(int cpu) int amd_set_subcaches(int cpu, unsigned long mask) { static unsigned int reset, ban; - struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu)); unsigned int reg; int cuid; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c deleted file mode 100644 index 263eeaddb0aa..000000000000 --- a/arch/x86/kernel/apb_timer.c +++ /dev/null @@ -1,347 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * apb_timer.c: Driver for Langwell APB timers - * - * (C) Copyright 2009 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * Note: - * Langwell is the south complex of Intel Moorestown MID platform. There are - * eight external timers in total that can be used by the operating system. - * The timer information, such as frequency and addresses, is provided to the - * OS via SFI tables. - * Timer interrupts are routed via FW/HW emulated IOAPIC independently via - * individual redirection table entries (RTE). - * Unlike HPET, there is no master counter, therefore one of the timers are - * used as clocksource. The overall allocation looks like: - * - timer 0 - NR_CPUs for per cpu timer - * - one timer for clocksource - * - one timer for watchdog driver. - * It is also worth notice that APB timer does not support true one-shot mode, - * free-running mode will be used here to emulate one-shot mode. - * APB timer can also be used as broadcast timer along with per cpu local APIC - * timer, but by default APB timer has higher rating than local APIC timers. - */ - -#include <linux/delay.h> -#include <linux/dw_apb_timer.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/pm.h> -#include <linux/sfi.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/irq.h> - -#include <asm/fixmap.h> -#include <asm/apb_timer.h> -#include <asm/intel-mid.h> -#include <asm/time.h> - -#define APBT_CLOCKEVENT_RATING 110 -#define APBT_CLOCKSOURCE_RATING 250 - -#define APBT_CLOCKEVENT0_NUM (0) -#define APBT_CLOCKSOURCE_NUM (2) - -static phys_addr_t apbt_address; -static int apb_timer_block_enabled; -static void __iomem *apbt_virt_address; - -/* - * Common DW APB timer info - */ -static unsigned long apbt_freq; - -struct apbt_dev { - struct dw_apb_clock_event_device *timer; - unsigned int num; - int cpu; - unsigned int irq; - char name[10]; -}; - -static struct dw_apb_clocksource *clocksource_apbt; - -static inline void __iomem *adev_virt_addr(struct apbt_dev *adev) -{ - return apbt_virt_address + adev->num * APBTMRS_REG_SIZE; -} - -static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); - -#ifdef CONFIG_SMP -static unsigned int apbt_num_timers_used; -#endif - -static inline void apbt_set_mapping(void) -{ - struct sfi_timer_table_entry *mtmr; - int phy_cs_timer_id = 0; - - if (apbt_virt_address) { - pr_debug("APBT base already mapped\n"); - return; - } - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return; - } - apbt_address = (phys_addr_t)mtmr->phys_addr; - if (!apbt_address) { - printk(KERN_WARNING "No timer base from SFI, use default\n"); - apbt_address = APBT_DEFAULT_BASE; - } - apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE); - if (!apbt_virt_address) { - pr_debug("Failed mapping APBT phy address at %lu\n",\ - (unsigned long)apbt_address); - goto panic_noapbt; - } - apbt_freq = mtmr->freq_hz; - sfi_free_mtmr(mtmr); - - /* Now figure out the physical timer id for clocksource device */ - mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); - if (mtmr == NULL) - goto panic_noapbt; - - /* Now figure out the physical timer id */ - pr_debug("Use timer %d for clocksource\n", - (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE); - phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) / - APBTMRS_REG_SIZE; - - clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING, - "apbt0", apbt_virt_address + phy_cs_timer_id * - APBTMRS_REG_SIZE, apbt_freq); - return; - -panic_noapbt: - panic("Failed to setup APB system timer\n"); - -} - -static inline void apbt_clear_mapping(void) -{ - iounmap(apbt_virt_address); - apbt_virt_address = NULL; -} - -static int __init apbt_clockevent_register(void) -{ - struct sfi_timer_table_entry *mtmr; - struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev); - - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return -ENODEV; - } - - adev->num = smp_processor_id(); - adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", - intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ? - APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, - adev_virt_addr(adev), 0, apbt_freq); - /* Firmware does EOI handling for us. */ - adev->timer->eoi = NULL; - - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { - global_clock_event = &adev->timer->ced; - printk(KERN_DEBUG "%s clockevent registered as global\n", - global_clock_event->name); - } - - dw_apb_clockevent_register(adev->timer); - - sfi_free_mtmr(mtmr); - return 0; -} - -#ifdef CONFIG_SMP - -static void apbt_setup_irq(struct apbt_dev *adev) -{ - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); -} - -/* Should be called with per cpu */ -void apbt_setup_secondary_clock(void) -{ - struct apbt_dev *adev; - int cpu; - - /* Don't register boot CPU clockevent */ - cpu = smp_processor_id(); - if (!cpu) - return; - - adev = this_cpu_ptr(&cpu_apbt_dev); - if (!adev->timer) { - adev->timer = dw_apb_clockevent_init(cpu, adev->name, - APBT_CLOCKEVENT_RATING, adev_virt_addr(adev), - adev->irq, apbt_freq); - adev->timer->eoi = NULL; - } else { - dw_apb_clockevent_resume(adev->timer); - } - - printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n", - cpu, adev->name, adev->cpu); - - apbt_setup_irq(adev); - dw_apb_clockevent_register(adev->timer); - - return; -} - -/* - * this notify handler process CPU hotplug events. in case of S0i3, nonboot - * cpus are disabled/enabled frequently, for performance reasons, we keep the - * per cpu timer irq registered so that we do need to do free_irq/request_irq. - * - * TODO: it might be more reliable to directly disable percpu clockevent device - * without the notifier chain. currently, cpu 0 may get interrupts from other - * cpu timers during the offline process due to the ordering of notification. - * the extra interrupt is harmless. - */ -static int apbt_cpu_dead(unsigned int cpu) -{ - struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - - dw_apb_clockevent_pause(adev->timer); - if (system_state == SYSTEM_RUNNING) { - pr_debug("skipping APBT CPU %u offline\n", cpu); - } else { - pr_debug("APBT clockevent for cpu %u offline\n", cpu); - dw_apb_clockevent_stop(adev->timer); - } - return 0; -} - -static __init int apbt_late_init(void) -{ - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || - !apb_timer_block_enabled) - return 0; - return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL, - apbt_cpu_dead); -} -fs_initcall(apbt_late_init); -#else - -void apbt_setup_secondary_clock(void) {} - -#endif /* CONFIG_SMP */ - -static int apbt_clocksource_register(void) -{ - u64 start, now; - u64 t1; - - /* Start the counter, use timer 2 as source, timer 0/1 for event */ - dw_apb_clocksource_start(clocksource_apbt); - - /* Verify whether apbt counter works */ - t1 = dw_apb_clocksource_read(clocksource_apbt); - start = rdtsc(); - - /* - * We don't know the TSC frequency yet, but waiting for - * 200000 TSC cycles is safe: - * 4 GHz == 50us - * 1 GHz == 200us - */ - do { - rep_nop(); - now = rdtsc(); - } while ((now - start) < 200000UL); - - /* APBT is the only always on clocksource, it has to work! */ - if (t1 == dw_apb_clocksource_read(clocksource_apbt)) - panic("APBT counter not counting. APBT disabled\n"); - - dw_apb_clocksource_register(clocksource_apbt); - - return 0; -} - -/* - * Early setup the APBT timer, only use timer 0 for booting then switch to - * per CPU timer if possible. - * returns 1 if per cpu apbt is setup - * returns 0 if no per cpu apbt is chosen - * panic if set up failed, this is the only platform timer on Moorestown. - */ -void __init apbt_time_init(void) -{ -#ifdef CONFIG_SMP - int i; - struct sfi_timer_table_entry *p_mtmr; - struct apbt_dev *adev; -#endif - - if (apb_timer_block_enabled) - return; - apbt_set_mapping(); - if (!apbt_virt_address) - goto out_noapbt; - /* - * Read the frequency and check for a sane value, for ESL model - * we extend the possible clock range to allow time scaling. - */ - - if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { - pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq); - goto out_noapbt; - } - if (apbt_clocksource_register()) { - pr_debug("APBT has failed to register clocksource\n"); - goto out_noapbt; - } - if (!apbt_clockevent_register()) - apb_timer_block_enabled = 1; - else { - pr_debug("APBT has failed to register clockevent\n"); - goto out_noapbt; - } -#ifdef CONFIG_SMP - /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { - printk(KERN_INFO "apbt: disabled per cpu timer\n"); - return; - } - pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); - if (num_possible_cpus() <= sfi_mtimer_num) - apbt_num_timers_used = num_possible_cpus(); - else - apbt_num_timers_used = 1; - pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); - - /* here we set up per CPU timer data structure */ - for (i = 0; i < apbt_num_timers_used; i++) { - adev = &per_cpu(cpu_apbt_dev, i); - adev->num = i; - adev->cpu = i; - p_mtmr = sfi_get_mtmr(i); - if (p_mtmr) - adev->irq = p_mtmr->irq; - else - printk(KERN_ERR "Failed to get timer for cpu %d\n", i); - snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i); - } -#endif - - return; - -out_noapbt: - apbt_clear_mapping(); - apb_timer_block_enabled = 0; - panic("failed to enable APB timer\n"); -} diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b3eef1d5c903..4f26700f314d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -41,6 +41,7 @@ #include <asm/perf_event.h> #include <asm/x86_init.h> #include <linux/atomic.h> +#include <asm/barrier.h> #include <asm/mpspec.h> #include <asm/i8259.h> #include <asm/proto.h> @@ -94,6 +95,11 @@ static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; /* + * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID + */ +static bool virt_ext_dest_id __ro_after_init; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -472,6 +478,9 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; + /* This MSR is special and need a special fence: */ + weak_wrmsr_fence(); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; @@ -1591,7 +1600,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - if (apic->dest_logical) { + if (apic->dest_mode_logical) { int logical_apicid, ldr_apicid; /* @@ -1738,6 +1747,7 @@ void apic_ap_setup(void) #ifdef CONFIG_X86_X2APIC int x2apic_mode; +EXPORT_SYMBOL_GPL(x2apic_mode); enum { X2APIC_OFF, @@ -1841,20 +1851,34 @@ static __init void try_to_enable_x2apic(int remap_mode) return; if (remap_mode != IRQ_REMAP_X2APIC_MODE) { - /* IR is required if there is APIC ID > 255 even when running - * under KVM + u32 apic_limit = 255; + + /* + * Using X2APIC without IR is not architecturally supported + * on bare metal but may be supported in guests. */ - if (max_physical_apicid > 255 || - !x86_init.hyper.x2apic_available()) { + if (!x86_init.hyper.x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; } /* - * without IR all CPUs can be addressed by IOAPIC/MSI - * only in physical mode + * If the hypervisor supports extended destination ID in + * MSI, that increases the maximum APIC ID that can be + * used for non-remapped IRQ domains. */ + if (x86_init.hyper.msi_ext_dest_id()) { + virt_ext_dest_id = 1; + apic_limit = 32767; + } + + /* + * Without IR, all CPUs can be addressed by IOAPIC/MSI only + * in physical mode, and CPUs with an APIC ID that cannnot + * be addressed must not be brought online. + */ + x2apic_set_max_apicid(apic_limit); x2apic_phys = 1; } x2apic_enable(); @@ -2114,18 +2138,11 @@ void __init register_lapic_address(unsigned long address) * Local APIC interrupts */ -/** - * spurious_interrupt - Catch all for interrupts raised on unused vectors - * @regs: Pointer to pt_regs on stack - * @vector: The vector number - * - * This is invoked from ASM entry code to catch all interrupts which - * trigger on an entry which is routed to the common_spurious idtentry - * point. - * - * Also called from sysvec_spurious_apic_interrupt(). +/* + * Common handling code for spurious_interrupt and spurious_vector entry + * points below. No point in allowing the compiler to inline it twice. */ -DEFINE_IDTENTRY_IRQ(spurious_interrupt) +static noinline void handle_spurious_interrupt(u8 vector) { u32 v; @@ -2160,9 +2177,23 @@ out: trace_spurious_apic_exit(vector); } +/** + * spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @vector: The vector number + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. + */ +DEFINE_IDTENTRY_IRQ(spurious_interrupt) +{ + handle_spurious_interrupt(vector); +} + DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) { - __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); + handle_spurious_interrupt(SPURIOUS_APIC_VECTOR); } /* @@ -2311,6 +2342,11 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread @@ -2478,6 +2514,46 @@ int hard_smp_processor_id(void) return read_apic_id(); } +void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical; + msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF; + + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED; + msg->arch_data.vector = cfg->vector; + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address APICs which can't be addressed in the normal 32-bit + * address range at 0xFFExxxxx. That is typically just 8 bits, but + * some hypervisors allow the extended destination ID field in bits + * 5-11 to be used, giving support for 15 bits of APIC IDs in total. + */ + if (dmar) + msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; + else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000) + msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8; + else + WARN_ON_ONCE(cfg->dest_apicid > 0xFF); +} + +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) +{ + u32 dest = msg->arch_addr_lo.destid_0_7; + + if (extid) + dest |= msg->arch_addr_hi.destid_8_31 << 8; + return dest; +} +EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7862b152a052..8f72b4351c9f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector) unsigned long flags; local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -113,15 +113,13 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -206,15 +204,13 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = physflat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 780c702969b7..fe78319e0f7a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -95,19 +95,15 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 35edd57f064a..a54d817eb4b6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -246,15 +246,13 @@ static const struct apic apic_numachip1 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -295,15 +293,13 @@ static const struct apic apic_numachip2 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 98d015a4405a..77555f66c14d 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -127,16 +127,13 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* phys delivery to target CPU: */ - .irq_dest_mode = 0, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = bigsmp_check_apicid_used, + .check_apicid_used = bigsmp_check_apicid_used, .init_apic_ldr = bigsmp_init_apic_ldr, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7b3c7e0d4a09..73ff4dd426a8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,6 +48,7 @@ #include <linux/jiffies.h> /* time_after() */ #include <linux/slab.h> #include <linux/memblock.h> +#include <linux/msi.h> #include <asm/irqdomain.h> #include <asm/io.h> @@ -63,7 +64,6 @@ #include <asm/setup.h> #include <asm/irq_remapping.h> #include <asm/hw_irq.h> - #include <asm/apic.h> #define for_each_ioapic(idx) \ @@ -89,12 +89,12 @@ struct irq_pin_list { }; struct mp_chip_data { - struct list_head irq_2_pin; - struct IO_APIC_route_entry entry; - int trigger; - int polarity; + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + bool is_level; + bool active_low; + bool isa_irq; u32 count; - bool isa_irq; }; struct mp_ioapic_gsi { @@ -198,7 +198,7 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +/* Will be called in mpparse/ACPI codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) { int i; @@ -286,31 +286,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + entry.w1 = io_apic_read(apic, 0x10 + 2 * pin); + entry.w2 = io_apic_read(apic, 0x11 + 2 * pin); - return eu.entry; + return entry; } static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - eu.entry = __ioapic_read_entry(apic, pin); + entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; + return entry; } /* @@ -321,11 +316,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) */ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu = {{0, 0}}; - - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); } static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) @@ -344,12 +336,12 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) */ static void ioapic_mask_entry(int apic, int pin) { + struct IO_APIC_route_entry e = { .masked = true }; unsigned long flags; - union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -422,20 +414,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void io_apic_modify_irq(struct mp_chip_data *data, - int mask_and, int mask_or, +static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, void (*final)(struct irq_pin_list *entry)) { - union entry_union eu; struct irq_pin_list *entry; - eu.entry = data->entry; - eu.w1 &= mask_and; - eu.w1 |= mask_or; - data->entry = eu.entry; + data->entry.masked = masked; for_each_irq_pin(entry, data->irq_2_pin) { - io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1); if (final) final(entry); } @@ -459,13 +446,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, true, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, false, NULL); } static void unmask_ioapic_irq(struct irq_data *irq_data) @@ -506,8 +493,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = IOAPIC_MASKED; - entry1.trigger = IOAPIC_EDGE; + entry1.masked = true; + entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); @@ -535,15 +522,15 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) + if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI) return; /* * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -556,8 +543,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. */ - if (entry.trigger == IOAPIC_EDGE) { - entry.trigger = IOAPIC_LEVEL; + if (!entry.is_level) { + entry.is_level = true; ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -659,8 +646,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); } } @@ -745,44 +732,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#ifdef CONFIG_EISA -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always active high edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (IOAPIC_EDGE) -#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always active low level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (IOAPIC_LEVEL) -#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) - -static int irq_polarity(int idx) +static bool irq_active_low(int idx) { int bus = mp_irqs[idx].srcbus; @@ -791,90 +741,139 @@ static int irq_polarity(int idx) */ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { case MP_IRQPOL_DEFAULT: - /* conforms to spec, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - return default_ISA_polarity(idx); - else - return default_PCI_polarity(idx); + /* + * Conforms to spec, ie. bus-type dependent polarity. PCI + * defaults to low active. [E]ISA defaults to high active. + */ + return !test_bit(bus, mp_bus_not_pci); case MP_IRQPOL_ACTIVE_HIGH: - return IOAPIC_POL_HIGH; + return false; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); fallthrough; case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_POL_LOW; + return true; } } #ifdef CONFIG_EISA -static int eisa_irq_trigger(int idx, int bus, int trigger) +/* + * EISA Edge/Level control register, ELCR + */ +static bool EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return false; +} + +/* + * EISA interrupts are always active high and can be edge or level + * triggered depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must be + * read in from the ELCR. + */ +static bool eisa_irq_is_level(int idx, int bus, bool level) { switch (mp_bus_id_to_type[bus]) { case MP_BUS_PCI: case MP_BUS_ISA: - return trigger; + return level; case MP_BUS_EISA: - return default_EISA_trigger(idx); + return EISA_ELCR(mp_irqs[idx].srcbusirq); } pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); - return IOAPIC_LEVEL; + return true; } #else -static inline int eisa_irq_trigger(int idx, int bus, int trigger) +static inline int eisa_irq_is_level(int idx, int bus, bool level) { - return trigger; + return level; } #endif -static int irq_trigger(int idx) +static bool irq_is_level(int idx) { int bus = mp_irqs[idx].srcbus; - int trigger; + bool level; /* * Determine IRQ trigger mode (edge or level sensitive): */ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { case MP_IRQTRIG_DEFAULT: - /* conforms to spec, ie. bus-type dependent trigger mode */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); + /* + * Conforms to spec, ie. bus-type dependent trigger + * mode. PCI defaults to level, ISA to edge. + */ + level = !test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ - return eisa_irq_trigger(idx, bus, trigger); + return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: - return IOAPIC_EDGE; + return false; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); fallthrough; case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_LEVEL; + return true; } } +static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_is_level(idx); + *polarity = irq_active_low(idx); + return 0; +} + +#ifdef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low) +{ + *is_level = *active_low = 0; + return __acpi_get_override_irq(gsi, (bool *)is_level, + (bool *)active_low); +} +#endif + void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity) { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; info->ioapic.node = node; - info->ioapic.trigger = trigger; - info->ioapic.polarity = polarity; + info->ioapic.is_level = trigger; + info->ioapic.active_low = polarity; info->ioapic.valid = 1; } -#ifndef CONFIG_ACPI -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); -#endif - static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, struct irq_alloc_info *src, u32 gsi, int ioapic_idx, int pin) { - int trigger, polarity; + bool level, pol_low; copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; @@ -883,20 +882,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, dst->ioapic.valid = 1; if (src && src->ioapic.valid) { dst->ioapic.node = src->ioapic.node; - dst->ioapic.trigger = src->ioapic.trigger; - dst->ioapic.polarity = src->ioapic.polarity; + dst->ioapic.is_level = src->ioapic.is_level; + dst->ioapic.active_low = src->ioapic.active_low; } else { dst->ioapic.node = NUMA_NO_NODE; - if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic.trigger = trigger; - dst->ioapic.polarity = polarity; + if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) { + dst->ioapic.is_level = level; + dst->ioapic.active_low = pol_low; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic.trigger = IOAPIC_LEVEL; - dst->ioapic.polarity = IOAPIC_POL_LOW; + dst->ioapic.is_level = true; + dst->ioapic.active_low = true; } } } @@ -906,12 +905,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info) return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } -static void mp_register_handler(unsigned int irq, unsigned long trigger) +static void mp_register_handler(unsigned int irq, bool level) { irq_flow_handler_t hdl; bool fasteoi; - if (trigger) { + if (level) { irq_set_status_flags(irq, IRQ_LEVEL); fasteoi = true; } else { @@ -933,14 +932,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic.trigger != data->trigger) - mp_register_handler(irq, info->ioapic.trigger); - data->entry.trigger = data->trigger = info->ioapic.trigger; - data->entry.polarity = data->polarity = info->ioapic.polarity; + if (info->ioapic.is_level != data->is_level) + mp_register_handler(irq, info->ioapic.is_level); + data->entry.is_level = data->is_level = info->ioapic.is_level; + data->entry.active_low = data->active_low = info->ioapic.active_low; } - return data->trigger == info->ioapic.trigger && - data->polarity == info->ioapic.polarity; + return data->is_level == info->ioapic.is_level && + data->active_low == info->ioapic.active_low; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -1033,6 +1032,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); @@ -1219,10 +1228,9 @@ void ioapic_zap_locks(void) static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { - int i; - char buf[256]; struct IO_APIC_route_entry entry; - struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + char buf[256]; + int i; printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { @@ -1230,20 +1238,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", i, - entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", - entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", - entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.masked ? "disabled" : "enabled ", + entry.is_level ? "level" : "edge ", + entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); - if (ir_entry->format) + if (entry.ir_format) { printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index2 << 15) | ir_entry->index, - ir_entry->zero); - else - printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, - entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? - "logical " : "physical", - entry.dest, entry.delivery_mode); + (entry.ir_index_15 << 15) | entry.ir_index_0_14, + entry.ir_zero); + } else { + printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.virt_destid_8_14, entry.destid_0_7, + entry.delivery_mode); + } } } @@ -1368,7 +1377,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + if (!entry.masked && + entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1410,14 +1420,16 @@ void native_restore_boot_irq_mode(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; + u32 apic_id = read_apic_id(); memset(&entry, 0, sizeof(entry)); - entry.mask = IOAPIC_UNMASKED; - entry.trigger = IOAPIC_EDGE; - entry.polarity = IOAPIC_POL_HIGH; - entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry.delivery_mode = dest_ExtINT; - entry.dest = read_apic_id(); + entry.masked = false; + entry.is_level = false; + entry.active_low = false; + entry.dest_mode_logical = false; + entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry.destid_0_7 = apic_id & 0xFF; + entry.virt_destid_8_14 = apic_id >> 8; /* * Add it to the IO-APIC irq-routing table: @@ -1618,21 +1630,16 @@ static void __init delay_without_tsc(void) static int __init timer_irq_works(void) { unsigned long t1 = jiffies; - unsigned long flags; if (no_timer_check) return 1; - local_save_flags(flags); local_irq_enable(); - if (boot_cpu_has(X86_FEATURE_TSC)) delay_with_tsc(); else delay_without_tsc(); - local_irq_restore(flags); - /* * Expect a few ticks at least, to be sure some possible * glue logic does not lock up after one or two first @@ -1641,10 +1648,10 @@ static int __init timer_irq_works(void) * least one tick may be lost due to delays. */ - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; + local_irq_disable(); + + /* Did jiffies advance? */ + return time_after(jiffies, t1 + 4); } /* @@ -1696,13 +1703,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; + struct IO_APIC_route_entry e; int pin; pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); + e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { + if (e.irr) { raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } @@ -1849,21 +1856,62 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +/* + * The I/OAPIC is just a device for generating MSI messages from legacy + * interrupt pins. Various fields of the RTE translate into bits of the + * resulting MSI which had a historical meaning. + * + * With interrupt remapping, many of those bits have different meanings + * in the underlying MSI, but the way that the I/OAPIC transforms them + * from its RTE to the MSI message is the same. This function allows + * the parent IRQ domain to compose the MSI message, then takes the + * relevant bits to put them in the appropriate places in the RTE in + * order to generate that message when the IRQ happens. + * + * The setup here relies on a preconfigured route entry (is_level, + * active_low, masked) because the parent domain is merely composing the + * generic message routing information which is used for the MSI. + */ +static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, + struct IO_APIC_route_entry *entry) +{ + struct msi_msg msg; + + /* Let the parent domain compose the MSI message */ + irq_chip_compose_msi_msg(irq_data, &msg); + + /* + * - Real vector + * - DMAR/IR: 8bit subhandle (ioapic.pin) + * - AMD/IR: 8bit IRTE index + */ + entry->vector = msg.arch_data.vector; + /* Delivery mode (for DMAR/IR all 0) */ + entry->delivery_mode = msg.arch_data.delivery_mode; + /* Destination mode or DMAR/IR index bit 15 */ + entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical; + /* DMAR/IR: 1, 0 for all other modes */ + entry->ir_format = msg.arch_addr_lo.dmar_format; + /* + * - DMAR/IR: index bit 0-14. + * + * - Virt: If the host supports x2apic without a virtualized IR + * unit then bit 0-6 of dmar_index_0_14 are providing bit + * 8-14 of the destination id. + * + * All other modes have bit 0-6 of dmar_index_0_14 cleared and the + * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). + */ + entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14; +} + static void ioapic_configure_entry(struct irq_data *irqd) { struct mp_chip_data *mpd = irqd->chip_data; - struct irq_cfg *cfg = irqd_cfg(irqd); struct irq_pin_list *entry; - /* - * Only update when the parent is the vector domain, don't touch it - * if the parent is the remapping domain. Check the installed - * ioapic chip to verify that. - */ - if (irqd->chip == &ioapic_chip) { - mpd->entry.dest = cfg->dest_apicid; - mpd->entry.vector = cfg->vector; - } + ioapic_setup_msg_from_msi(irqd, &mpd->entry); + for_each_irq_pin(entry, mpd->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } @@ -1919,7 +1967,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, * irrelevant because the IO-APIC treats them as fire and * forget. */ - if (rentry.irr && rentry.trigger) { + if (rentry.irr && rentry.is_level) { *state = true; break; } @@ -2027,6 +2075,7 @@ static inline void __init unlock_ExtINT_logic(void) int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; + u32 apic_id; pin = find_isa_irq_pin(8, mp_INT); if (pin == -1) { @@ -2042,14 +2091,16 @@ static inline void __init unlock_ExtINT_logic(void) entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); + apic_id = hard_smp_processor_id(); memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry1.mask = IOAPIC_UNMASKED; - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = IOAPIC_EDGE; + entry1.dest_mode_logical = true; + entry1.masked = false; + entry1.destid_0_7 = apic_id & 0xFF; + entry1.virt_destid_8_14 = apic_id >> 8; + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry1.active_low = entry0.active_low; + entry1.is_level = false; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2117,13 +2168,12 @@ static inline void __init check_timer(void) struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; - unsigned long flags; int no_pin1 = 0; if (!global_clock_event) return; - local_irq_save(flags); + local_irq_disable(); /* * get/set the timer IRQ vector: @@ -2178,9 +2228,9 @@ static inline void __init check_timer(void) * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ - int idx; - idx = find_irq_entry(apic1, pin1, mp_INT); - if (idx != -1 && irq_trigger(idx)) + int idx = find_irq_entry(apic1, pin1, mp_INT); + + if (idx != -1 && irq_is_level(idx)) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); @@ -2191,7 +2241,6 @@ static inline void __init check_timer(void) goto out; } panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); - local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2215,7 +2264,6 @@ static inline void __init check_timer(void) /* * Cleanup, just in case ... */ - local_irq_disable(); legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); @@ -2232,7 +2280,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -2251,7 +2298,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); if (apic_is_x2apic_enabled()) apic_printk(APIC_QUIET, KERN_INFO @@ -2260,7 +2306,7 @@ static inline void __init check_timer(void) panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: - local_irq_restore(flags); + local_irq_enable(); } /* @@ -2284,36 +2330,37 @@ out: static int mp_irqdomain_create(int ioapic) { - struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); struct fwnode_handle *fn; - char *name = "IO-APIC"; + struct irq_fwspec fwspec; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; - info.devid = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_irq_domain(&info); - if (!parent) - parent = x86_vector_domain; - else - name = "IO-APIC-IR"; - /* Handle device tree enumerated APICs proper */ if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) return -ENOMEM; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = mpc_ioapic_id(ioapic); + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENODEV; + } + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); @@ -2587,30 +2634,6 @@ static int io_apic_get_version(int ioapic) return reg_01.bits.version; } -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) -{ - int ioapic, pin, idx; - - if (skip_ioapic_setup) - return -1; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -1; - - pin = mp_find_ioapic_pin(ioapic, gsi); - if (pin < 0) - return -1; - - idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - return -1; - - *trigger = irq_trigger(idx); - *polarity = irq_polarity(idx); - return 0; -} - /* * This function updates target affinity of IOAPIC interrupts to include * the CPUs which came online during SMP bringup. @@ -2850,7 +2873,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, /* * If mp_register_ioapic() is called during early boot stage when - * walking ACPI/SFI/DT tables, it's too early to create irqdomain, + * walking ACPI/DT tables, it's too early to create irqdomain, * we are still using bootmem allocator. So delay it to setup_IO_APIC(). */ if (hotplug) { @@ -2934,44 +2957,49 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { if (info && info->ioapic.valid) { - data->trigger = info->ioapic.trigger; - data->polarity = info->ioapic.polarity; - } else if (acpi_get_override_irq(gsi, &data->trigger, - &data->polarity) < 0) { + data->is_level = info->ioapic.is_level; + data->active_low = info->ioapic.active_low; + } else if (__acpi_get_override_irq(gsi, &data->is_level, + &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ - data->trigger = IOAPIC_LEVEL; - data->polarity = IOAPIC_POL_LOW; + data->is_level = true; + data->active_low = true; } } -static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, - struct IO_APIC_route_entry *entry) +/* + * Configure the I/O-APIC specific fields in the routing entry. + * + * This is important to setup the I/O-APIC specific bits (is_level, + * active_low, masked) because the underlying parent domain will only + * provide the routing information and is oblivious of the I/O-APIC + * specific bits. + * + * The entry is just preconfigured at this point and not written into the + * RTE. This happens later during activation which will fill in the actual + * routing information. + */ +static void mp_preconfigure_entry(struct mp_chip_data *data) { + struct IO_APIC_route_entry *entry = &data->entry; + memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = cfg->dest_apicid; - entry->vector = cfg->vector; - entry->trigger = data->trigger; - entry->polarity = data->polarity; + entry->is_level = data->is_level; + entry->active_low = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. */ - if (data->trigger == IOAPIC_LEVEL) - entry->mask = IOAPIC_MASKED; - else - entry->mask = IOAPIC_UNMASKED; + entry->masked = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - int ret, ioapic, pin; - struct irq_cfg *cfg; - struct irq_data *irq_data; - struct mp_chip_data *data; struct irq_alloc_info *info = arg; + struct mp_chip_data *data; + struct irq_data *irq_data; + int ret, ioapic, pin; unsigned long flags; if (!info || nr_irqs > 1) @@ -2989,7 +3017,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic.entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -3003,22 +3030,20 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + mp_preconfigure_entry(data); + mp_register_handler(virq, data->is_level); + local_irq_save(flags); - if (info->ioapic.entry) - mp_setup_entry(cfg, data, info->ioapic.entry); - mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", - ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->trigger, data->polarity, cfg->dest_apicid); - + "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, + data->is_level, data->active_low); return 0; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 387154e39e08..d1fb874fbe64 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -260,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -279,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, continue; __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); } local_irq_restore(flags); } @@ -297,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 6313f0a05db7..44ebe25e7703 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -15,7 +15,6 @@ #include <linux/hpet.h> #include <linux/msi.h> #include <asm/irqdomain.h> -#include <asm/msidef.h> #include <asm/hpet.h> #include <asm/hw_irq.h> #include <asm/apic.h> @@ -23,38 +22,11 @@ struct irq_domain *x86_pci_msi_default_domain __ro_after_init; -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); -} - -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) -{ - __irq_msi_compose_msg(irqd_cfg(data), msg); -} - static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; - __irq_msi_compose_msg(cfg, msg); + __irq_msi_compose_msg(cfg, msg, false); irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); } @@ -276,6 +248,17 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, #endif #ifdef CONFIG_DMAR_TABLE +/* + * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the + * high bits of the destination APIC ID. This can't be done in the general + * case for MSIs as it would be targeting real memory above 4GiB not the + * APIC. + */ +static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, true); +} + static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -288,6 +271,7 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -356,114 +340,3 @@ void dmar_free_hwirq(int irq) irq_domain_free_irqs(irq, 1); } #endif - -/* - * MSI message composition - */ -#ifdef CONFIG_HPET_TIMER -static inline int hpet_dev_id(struct irq_domain *domain) -{ - struct msi_domain_info *info = msi_get_domain_info(domain); - - return (int)(long)info->data; -} - -static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) -{ - hpet_msi_write(irq_data_get_irq_handler_data(data), msg); -} - -static struct irq_chip hpet_msi_controller __ro_after_init = { - .name = "HPET-MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static int hpet_msi_init(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq, - irq_hw_number_t hwirq, msi_alloc_info_t *arg) -{ - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, - handle_edge_irq, arg->data, "edge"); - - return 0; -} - -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); -} - -static struct msi_domain_ops hpet_msi_domain_ops = { - .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, -}; - -static struct msi_domain_info hpet_msi_domain_info = { - .ops = &hpet_msi_domain_ops, - .chip = &hpet_msi_controller, - .flags = MSI_FLAG_USE_DEF_DOM_OPS, -}; - -struct irq_domain *hpet_create_irq_domain(int hpet_id) -{ - struct msi_domain_info *domain_info; - struct irq_domain *parent, *d; - struct irq_alloc_info info; - struct fwnode_handle *fn; - - if (x86_vector_domain == NULL) - return NULL; - - domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); - if (!domain_info) - return NULL; - - *domain_info = hpet_msi_domain_info; - domain_info->data = (void *)(long)hpet_id; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; - info.devid = hpet_id; - parent = irq_remapping_get_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, - hpet_id); - if (!fn) { - kfree(domain_info); - return NULL; - } - - d = msi_create_irq_domain(fn, domain_info, parent); - if (!d) { - irq_domain_free_fwnode(fn); - kfree(domain_info); - } - return d; -} - -int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, - int dev_num) -{ - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.data = hc; - info.devid = hpet_dev_id(domain); - info.hwirq = dev_num; - - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); -} -#endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 67b6f7c049ec..a61f642b1b90 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -69,16 +69,13 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 758bbf25ef74..3c9c7492252f 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -640,7 +640,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "IO-APIC-", 8) && + simple_strtol(fwname+8, NULL, 10) == fwspec->param[0]; + } + return to_of_node(fwspec->fwnode) && + of_device_is_compatible(to_of_node(fwspec->fwnode), + "intel,ce4100-ioapic"); +} + +int x86_fwspec_is_hpet(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "HPET-MSI-", 9) && + simple_strtol(fwname+9, NULL, 10) == fwspec->param[0]; + } + return 0; +} + +static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* + * HPET and I/OAPIC cannot be parented in the vector domain + * if IRQ remapping is enabled. APIC IDs above 15 bits are + * only permitted if IRQ remapping is enabled, so check that. + */ + if (apic->apic_id_valid(32768)) + return 0; + + return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); +} + static const struct irq_domain_ops x86_vector_domain_ops = { + .select = x86_vector_select, .alloc = x86_vector_alloc_irqs, .free = x86_vector_free_irqs, .activate = x86_vector_activate, @@ -822,6 +865,12 @@ void apic_ack_edge(struct irq_data *irqd) apic_ack_irq(irqd); } +static void x86_vector_msi_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, false); +} + static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b0889c48a2ac..f4da9bb69a88 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); } @@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long flags; u32 dest; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); @@ -61,7 +63,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) if (!dest) continue; - __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } @@ -184,15 +186,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bc9693841353..6bde05a86b4e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -8,6 +8,12 @@ int x2apic_phys; static struct apic apic_x2apic_phys; +static u32 x2apic_max_apicid __ro_after_init; + +void __init x2apic_set_max_apicid(u32 apicid) +{ + x2apic_max_apicid = apicid; +} static int __init set_x2apic_phys_mode(char *arg) { @@ -37,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_apicid, cpu); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); } @@ -48,7 +55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long this_cpu; unsigned long flags; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); @@ -98,6 +106,9 @@ static int x2apic_phys_probe(void) /* Common x2apic functions, also used by x2apic_cluster */ int x2apic_apic_id_valid(u32 apicid) { + if (x2apic_max_apicid && apicid > x2apic_max_apicid) + return 0; + return 1; } @@ -116,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which) { unsigned long cfg = __prepare_ICR(which, vector, 0); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); native_x2apic_icr_write(cfg, 0); } @@ -148,15 +160,13 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 235f5cde06fc..52bc217ca8c3 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -502,6 +502,18 @@ enum uv_system_type get_uv_system_type(void) return uv_system_type; } +int uv_get_hubless_system(void) +{ + return uv_hubless_system; +} +EXPORT_SYMBOL_GPL(uv_get_hubless_system); + +ssize_t uv_get_archtype(char *buf, int len) +{ + return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id); +} +EXPORT_SYMBOL_GPL(uv_get_archtype); + int is_uv_system(void) { return uv_system_type != UV_NONE; @@ -716,9 +728,9 @@ static void uv_send_IPI_one(int cpu, int vector) unsigned long dmode, val; if (vector == NMI_VECTOR) - dmode = dest_NMI; + dmode = APIC_DELIVERY_MODE_NMI; else - dmode = dest_Fixed; + dmode = APIC_DELIVERY_MODE_FIXED; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | @@ -820,15 +832,13 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* Physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = uv_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -1603,21 +1613,30 @@ static void check_efi_reboot(void) reboot_type = BOOT_ACPI; } -/* Setup user proc fs files */ +/* + * User proc fs file handling now deprecated. + * Recommend using /sys/firmware/sgi_uv/... instead. + */ static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n", + current->comm); seq_printf(file, "0x%x\n", uv_hubbed_system); return 0; } static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n", + current->comm); seq_printf(file, "0x%x\n", uv_hubless_system); return 0; } static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n", + current->comm); seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id); return 0; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 70b7154f4bdd..60b9f42ce3c1 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -66,7 +66,6 @@ static void __used common(void) OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); - OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2); #endif #ifdef CONFIG_XEN diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 828be792231e..b14533af7676 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,9 +13,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT_XXL - OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template, - cpu.usergs_sysret64); - OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs); #ifdef CONFIG_DEBUG_ENTRY OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); #endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 93792b457b81..637b499450d1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ +obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 0b2c03943ac6..23f5f27b5a02 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -10,6 +10,8 @@ */ #include <linux/interrupt.h> + +#include <asm/acrn.h> #include <asm/apic.h> #include <asm/cpufeatures.h> #include <asm/desc.h> @@ -19,7 +21,7 @@ static u32 __init acrn_detect(void) { - return hypervisor_cpuid_base("ACRNACRNACRN", 0); + return acrn_cpuid_base(); } static void __init acrn_init_platform(void) @@ -55,6 +57,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) set_irq_regs(old_regs); } +void acrn_setup_intr_handler(void (*handler)(void)) +{ + acrn_intr_handler = handler; +} +EXPORT_SYMBOL_GPL(acrn_setup_intr_handler); + +void acrn_remove_intr_handler(void) +{ + acrn_intr_handler = NULL; +} +EXPORT_SYMBOL_GPL(acrn_remove_intr_handler); + const __initconst struct hypervisor_x86 x86_hyper_acrn = { .name = "ACRN", .detect = acrn_detect, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6062ce586b95..347a956f71ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -23,7 +23,6 @@ #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/set_memory.h> #endif #include "cpu.h" @@ -330,7 +329,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) */ static void amd_get_topology(struct cpuinfo_x86 *c) { - u8 node_id; int cpu = smp_processor_id(); /* get information required for multi-node processors */ @@ -340,7 +338,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 0xff; + c->cpu_die_id = ecx & 0xff; if (c->x86 == 0x15) c->cu_id = ebx & 0xff; @@ -360,15 +358,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - cacheinfo_amd_init_llc_id(c, cpu, node_id); + cacheinfo_amd_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - node_id = value & 7; + c->cpu_die_id = value & 7; - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else return; @@ -393,7 +391,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; } static void amd_detect_ppin(struct cpuinfo_x86 *c) @@ -425,12 +423,6 @@ clear_ppin: clear_cpu_cap(c, X86_FEATURE_AMD_PPIN); } -u16 amd_get_nb_id(int cpu) -{ - return per_cpu(cpu_llc_id, cpu); -} -EXPORT_SYMBOL_GPL(amd_get_nb_id); - u32 amd_get_nodes_per_socket(void) { return nodes_per_socket; @@ -516,26 +508,6 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c) static void bsp_init_amd(struct cpuinfo_x86 *c) { - -#ifdef CONFIG_X86_64 - if (c->x86 >= 0xf) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - pr_debug("tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -#endif - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { if (c->x86 > 0x10 || @@ -570,12 +542,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) u32 ecx; ecx = cpuid_ecx(0x8000001e); - nodes_per_socket = ((ecx >> 8) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; } if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index e2f319dc992d..22911deacb6e 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,11 +14,13 @@ #include <linux/cpufreq.h> #include <linux/smp.h> #include <linux/sched/isolation.h> +#include <linux/rcupdate.h> #include "cpu.h" struct aperfmperf_sample { unsigned int khz; + atomic_t scfpending; ktime_t time; u64 aperf; u64 mperf; @@ -62,17 +64,20 @@ static void aperfmperf_snapshot_khz(void *dummy) s->aperf = aperf; s->mperf = mperf; s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + atomic_set_release(&s->scfpending, 0); } static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) { s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); /* Don't bother re-computing within the cache threshold time. */ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) return true; - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + if (!atomic_xchg(&s->scfpending, 1) || wait) + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); /* Return false if the previous iteration was too long ago. */ return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; @@ -89,6 +94,9 @@ unsigned int aperfmperf_get_khz(int cpu) if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) return 0; + if (rcu_is_idle_cpu(cpu)) + return 0; /* Idle CPUs are completely uninteresting. */ + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); return per_cpu(samples.khz, cpu); } @@ -108,6 +116,8 @@ void arch_freq_prepare_all(void) for_each_online_cpu(cpu) { if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) continue; + if (rcu_is_idle_cpu(cpu)) + continue; /* Idle CPUs are completely uninteresting. */ if (!aperfmperf_snapshot_cpu(cpu, now, false)) wait = true; } @@ -118,6 +128,8 @@ void arch_freq_prepare_all(void) unsigned int arch_freq_get_on_cpu(int cpu) { + struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + if (!cpu_khz) return 0; @@ -131,6 +143,8 @@ unsigned int arch_freq_get_on_cpu(int cpu) return per_cpu(samples.khz, cpu); msleep(APERFMPERF_REFRESH_DELAY_MS); + atomic_set(&s->scfpending, 1); + smp_mb(); /* ->scfpending before smp_call_function_single(). */ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); return per_cpu(samples.khz, cpu); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 57074cf3ad7c..3ca9be482a9e 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -580,7 +580,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) if (index < 3) return; - node = amd_get_nb_id(smp_processor_id()); + node = topology_die_id(smp_processor_id()); this_leaf->nb = node_to_amd_nb(node); if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) amd_calc_l3_indices(this_leaf->nb); @@ -646,7 +646,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -657,7 +657,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) if (c->x86 < 0x17) { /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. @@ -684,7 +684,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) } } -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) { /* * We may have multiple LLCs if L3 caches exist, so check if we diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 35ad8480c464..ab640abe26b6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); @@ -1739,8 +1742,8 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; +DEFINE_PER_CPU(void *, hardirq_stack_ptr); +DEFINE_PER_CPU(bool, hardirq_stack_inuse); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d502241995a3..42af31b64c2c 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, {} diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 29a3bedabd06..3b1b01f2b248 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,16 +93,41 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ +static void clear_sgx_caps(void) +{ + setup_clear_cpu_cap(X86_FEATURE_SGX); + setup_clear_cpu_cap(X86_FEATURE_SGX_LC); +} + +static int __init nosgx(char *str) +{ + clear_sgx_caps(); + + return 0; +} + +early_param("nosgx", nosgx); + void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { bool tboot = tboot_enabled(); + bool enable_sgx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); + clear_sgx_caps(); return; } + /* + * Enable SGX if and only if the kernel supports SGX and Launch Control + * is supported, i.e. disable SGX if the LE hash MSRs can't be written. + */ + enable_sgx = cpu_has(c, X86_FEATURE_SGX) && + cpu_has(c, X86_FEATURE_SGX_LC) && + IS_ENABLED(CONFIG_X86_SGX); + if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -124,13 +149,16 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } + if (enable_sgx) + msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + wrmsrl(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); if (!cpu_has(c, X86_FEATURE_VMX)) - return; + goto update_sgx; if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) || (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) { @@ -143,4 +171,12 @@ update_caps: init_vmx_capabilities(c); #endif } + +update_sgx: + if (!(msr & FEAT_CTL_SGX_ENABLED) || + !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { + if (enable_sgx) + pr_err_once("SGX disabled by BIOS\n"); + clear_sgx_caps(); + } } diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index ac6c30e5801d..ae59115d18f9 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -14,9 +14,6 @@ #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> -#ifdef CONFIG_X86_64 -# include <asm/set_memory.h> -#endif #include "cpu.h" @@ -65,7 +62,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c) */ static void hygon_get_topology(struct cpuinfo_x86 *c) { - u8 node_id; int cpu = smp_processor_id(); /* get information required for multi-node processors */ @@ -75,7 +71,7 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 0xff; + c->cpu_die_id = ecx & 0xff; c->cpu_core_id = ebx & 0xff; @@ -93,14 +89,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) /* Socket ID is ApicId[6] for these processors. */ c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; - cacheinfo_hygon_init_llc_id(c, cpu, node_id); + cacheinfo_hygon_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - node_id = value & 7; + c->cpu_die_id = value & 7; - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else return; @@ -123,7 +119,7 @@ static void hygon_detect_cmp(struct cpuinfo_x86 *c) /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; } static void srat_detect_node(struct cpuinfo_x86 *c) @@ -204,23 +200,6 @@ static void early_init_hygon_mc(struct cpuinfo_x86 *c) static void bsp_init_hygon(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - pr_debug("tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } -#endif - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 59a1e3ce3f14..0e422a544835 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -24,6 +24,7 @@ #include <asm/traps.h> #include <asm/resctrl.h> #include <asm/numa.h> +#include <asm/thermal.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + + intel_init_thermal(c); } #ifdef CONFIG_X86_32 @@ -1159,6 +1162,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), {} }; diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile index 9f020c994154..015856abdbb1 100644 --- a/arch/x86/kernel/cpu/mce/Makefile +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o mce-inject-y := inject.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o - obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 0c6b02dd744c..e486f96b3cb3 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1341,7 +1341,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, return -ENODEV; if (is_shared_bank(bank)) { - nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb = node_to_amd_nb(topology_die_id(cpu)); /* threshold descriptor already initialized on this node? */ if (nb && nb->bank4) { @@ -1445,7 +1445,7 @@ static void threshold_remove_bank(struct threshold_bank *bank) * The last CPU on this node using the shared bank is going * away, remove that bank now. */ - nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id())); + nb = node_to_amd_nb(topology_die_id(smp_processor_id())); nb->bank4 = NULL; } diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index af8d37962586..b58b85380ddb 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -51,6 +51,67 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + unsigned int cpu; + struct mce m; + + if (!boot_cpu_has(X86_FEATURE_SMCA)) + return -EINVAL; + + /* + * The starting address of the register array extracted from BERT must + * match with the first expected register in the register layout of + * SMCA address space. This address corresponds to banks's MCA_STATUS + * register. + * + * Match any MCi_STATUS register by turning off bank numbers. + */ + if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) != + MSR_AMD64_SMCA_MC0_STATUS) + return -EINVAL; + + /* + * The register array size must be large enough to include all the + * SMCA registers which need to be extracted. + * + * The number of registers in the register array is determined by + * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. + * The register layout is fixed and currently the raw data in the + * register array includes 6 SMCA registers which the kernel can + * extract. + */ + if (ctx_info->reg_arr_size < 48) + return -EINVAL; + + mce_setup(&m); + + m.extcpu = -1; + m.socketid = -1; + + for_each_possible_cpu(cpu) { + if (cpu_data(cpu).initial_apicid == lapic_id) { + m.extcpu = cpu; + m.socketid = cpu_data(m.extcpu).phys_proc_id; + break; + } + } + + m.apicid = lapic_id; + m.bank = (ctx_info->msr_addr >> 4) & 0xFF; + m.status = *i_mce; + m.addr = *(i_mce + 1); + m.misc = *(i_mce + 2); + /* Skipping MCA_CONFIG */ + m.ipid = *(i_mce + 4); + m.synd = *(i_mce + 5); + + mce_log(&m); + + return 0; +} + #define CPER_CREATOR_MCE \ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ 0x64, 0x90, 0xb8, 0x9d) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 32b7099e3511..7962355436da 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -162,7 +162,8 @@ EXPORT_SYMBOL_GPL(mce_log); void mce_register_decode_chain(struct notifier_block *nb) { - if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + if (WARN_ON(nb->priority < MCE_PRIO_LOWEST || + nb->priority > MCE_PRIO_HIGHEST)) return; blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); @@ -877,6 +878,12 @@ static atomic_t mce_executing; static atomic_t mce_callin; /* + * Track which CPUs entered the MCA broadcast synchronization and which not in + * order to print holdouts. + */ +static cpumask_t mce_missing_cpus = CPU_MASK_ALL; + +/* * Check if a timeout waiting for other CPUs happened. */ static int mce_timed_out(u64 *t, const char *msg) @@ -893,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - if (mca_cfg.tolerant <= 1) + if (mca_cfg.tolerant <= 1) { + if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) + pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", + cpumask_pr_args(&mce_missing_cpus)); mce_panic(msg, NULL, NULL); + } cpu_missing = 1; return 1; } @@ -1005,6 +1016,7 @@ static int mce_start(int *no_way_out) * is updated before mce_callin. */ order = atomic_inc_return(&mce_callin); + cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); /* * Wait for everyone. @@ -1113,6 +1125,7 @@ static int mce_end(int order) reset: atomic_set(&global_nwo, 0); atomic_set(&mce_callin, 0); + cpumask_setall(&mce_missing_cpus); barrier(); /* @@ -1265,14 +1278,14 @@ static void kill_me_maybe(struct callback_head *cb) } } -static void queue_task_work(struct mce *m, int kill_it) +static void queue_task_work(struct mce *m, int kill_current_task) { current->mce_addr = m->addr; current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - if (kill_it) + if (kill_current_task) current->mce_kill_me.func = kill_me_now; else current->mce_kill_me.func = kill_me_maybe; @@ -1320,10 +1333,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int no_way_out = 0; /* - * If kill_it gets set, there might be a way to recover from this + * If kill_current_task is not set, there might be a way to recover from this * error. */ - int kill_it = 0; + int kill_current_task = 0; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES @@ -1350,8 +1363,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; - + kill_current_task = (cfg->tolerant == 3) ? 0 : 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. @@ -1368,7 +1380,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * to see it will clear it. */ if (lmce) { - if (no_way_out) + if (no_way_out && cfg->tolerant < 3) mce_panic("Fatal local machine check", &m, msg); } else { order = mce_start(&no_way_out); @@ -1387,6 +1399,9 @@ noinstr void do_machine_check(struct pt_regs *regs) if (mce_end(order) < 0) { if (!no_way_out) no_way_out = worst >= MCE_PANIC_SEVERITY; + + if (no_way_out && cfg->tolerant < 3) + mce_panic("Fatal machine check on current CPU", &m, msg); } } else { /* @@ -1403,19 +1418,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } } - /* - * If tolerant is at an insane level we drop requests to kill - * processes and continue even when there is no way out. - */ - if (cfg->tolerant == 3) - kill_it = 0; - else if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - - if (worst > 0) - irq_work_queue(&mce_irq_work); - - if (worst != MCE_AR_SEVERITY && !kill_it) + if (worst != MCE_AR_SEVERITY && !kill_current_task) goto out; /* Fault was in user mode and we need to take some action */ @@ -1423,7 +1426,7 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, kill_it); + queue_task_work(&m, kill_current_task); } else { /* @@ -1441,7 +1444,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, kill_it); + queue_task_work(&m, kill_current_task); } out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); @@ -1583,7 +1586,7 @@ static void __mcheck_cpu_mce_banks_init(void) * __mcheck_cpu_init_clear_banks() does the final bank setup. */ b->ctl = -1ULL; - b->init = 1; + b->init = true; } } @@ -1764,7 +1767,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = 0; + mce_banks[0].init = false; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1813,11 +1816,9 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); return 1; - break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); return 1; - break; default: return 0; } @@ -1985,7 +1986,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { - bool irq_state; + irqentry_state_t irq_state; WARN_ON_ONCE(user_mode(regs)); @@ -1997,25 +1998,26 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. */ instrumentation_begin(); - trace_hardirqs_off_finish(); + machine_check_vector(regs); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); + instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); + machine_check_vector(regs); + instrumentation_end(); irqentry_exit_to_user_mode(regs); } @@ -2188,7 +2190,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - mcheck_intel_therm_init(); mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); @@ -2723,6 +2724,7 @@ static void mce_reset(void) atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); + cpumask_setall(&mce_missing_cpus); } static int fake_panic_get(void *data, u64 *val) diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 3a44346f2276..7b360731fc2d 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -522,8 +522,8 @@ static void do_inject(void) if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); - cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + toggle_nb_mca_mst_cpu(topology_die_id(cpu)); + cpu = get_nbc_for_node(topology_die_id(cpu)); } get_online_cpus(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index abe9fe0fb851..e309476743b7 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -509,12 +509,32 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) } } +/* + * Enable additional error logs from the integrated + * memory controller on processors that support this. + */ +static void intel_imc_init(struct cpuinfo_x86 *c) +{ + u64 error_control; + + switch (c->x86_model) { + case INTEL_FAM6_SANDYBRIDGE_X: + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + return; + error_control |= 2; + wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + break; + } +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { - intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); intel_ppin_init(c); + intel_imc_init(c); } void mce_intel_feature_clear(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c deleted file mode 100644 index a7cd2d203ced..000000000000 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Thermal throttle event support code (such as syslog messaging and rate - * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). - * - * This allows consistent reporting of CPU thermal throttle events. - * - * Maintains a counter in /sys that keeps track of the number of thermal - * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog is rate limited). - * - * Author: Dmitriy Zavin (dmitriyz@google.com) - * - * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. - * Inspired by Ross Biro's and Al Borchers' counter code. - */ -#include <linux/interrupt.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <asm/processor.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/trace/irq_vectors.h> - -#include "internal.h" - -/* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) - -#define THERMAL_THROTTLING_EVENT 0 -#define POWER_LIMIT_EVENT 1 - -/** - * struct _thermal_state - Represent the current thermal event state - * @next_check: Stores the next timestamp, when it is allowed - * to log the next warning message. - * @last_interrupt_time: Stores the timestamp for the last threshold - * high event. - * @therm_work: Delayed workqueue structure - * @count: Stores the current running count for thermal - * or power threshold interrupts. - * @last_count: Stores the previous running count for thermal - * or power threshold interrupts. - * @max_time_ms: This shows the maximum amount of time CPU was - * in throttled state for a single thermal - * threshold high to low state. - * @total_time_ms: This is a cumulative time during which CPU was - * in the throttled state. - * @rate_control_active: Set when a throttling message is logged. - * This is used for the purpose of rate-control. - * @new_event: Stores the last high/low status of the - * THERM_STATUS_PROCHOT or - * THERM_STATUS_POWER_LIMIT. - * @level: Stores whether this _thermal_state instance is - * for a CORE level or for PACKAGE level. - * @sample_index: Index for storing the next sample in the buffer - * temp_samples[]. - * @sample_count: Total number of samples collected in the buffer - * temp_samples[]. - * @average: The last moving average of temperature samples - * @baseline_temp: Temperature at which thermal threshold high - * interrupt was generated. - * @temp_samples: Storage for temperature samples to calculate - * moving average. - * - * This structure is used to represent data related to thermal state for a CPU. - * There is a separate storage for core and package level for each CPU. - */ -struct _thermal_state { - u64 next_check; - u64 last_interrupt_time; - struct delayed_work therm_work; - unsigned long count; - unsigned long last_count; - unsigned long max_time_ms; - unsigned long total_time_ms; - bool rate_control_active; - bool new_event; - u8 level; - u8 sample_index; - u8 sample_count; - u8 average; - u8 baseline_temp; - u8 temp_samples[3]; -}; - -struct thermal_state { - struct _thermal_state core_throttle; - struct _thermal_state core_power_limit; - struct _thermal_state package_throttle; - struct _thermal_state package_power_limit; - struct _thermal_state core_thresh0; - struct _thermal_state core_thresh1; - struct _thermal_state pkg_thresh0; - struct _thermal_state pkg_thresh1; -}; - -/* Callback to handle core threshold interrupts */ -int (*platform_thermal_notify)(__u64 msr_val); -EXPORT_SYMBOL(platform_thermal_notify); - -/* Callback to handle core package threshold_interrupts */ -int (*platform_thermal_package_notify)(__u64 msr_val); -EXPORT_SYMBOL_GPL(platform_thermal_package_notify); - -/* Callback support of rate control, return true, if - * callback has rate control */ -bool (*platform_thermal_package_rate_control)(void); -EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); - - -static DEFINE_PER_CPU(struct thermal_state, thermal_state); - -static atomic_t therm_throt_en = ATOMIC_INIT(0); - -static u32 lvtthmr_init __read_mostly; - -#ifdef CONFIG_SYSFS -#define define_therm_throt_device_one_ro(_name) \ - static DEVICE_ATTR(_name, 0444, \ - therm_throt_device_show_##_name, \ - NULL) \ - -#define define_therm_throt_device_show_func(event, name) \ - \ -static ssize_t therm_throt_device_show_##event##_##name( \ - struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) { \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).event.name); \ - } else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ -} - -define_therm_throt_device_show_func(core_throttle, count); -define_therm_throt_device_one_ro(core_throttle_count); - -define_therm_throt_device_show_func(core_power_limit, count); -define_therm_throt_device_one_ro(core_power_limit_count); - -define_therm_throt_device_show_func(package_throttle, count); -define_therm_throt_device_one_ro(package_throttle_count); - -define_therm_throt_device_show_func(package_power_limit, count); -define_therm_throt_device_one_ro(package_power_limit_count); - -define_therm_throt_device_show_func(core_throttle, max_time_ms); -define_therm_throt_device_one_ro(core_throttle_max_time_ms); - -define_therm_throt_device_show_func(package_throttle, max_time_ms); -define_therm_throt_device_one_ro(package_throttle_max_time_ms); - -define_therm_throt_device_show_func(core_throttle, total_time_ms); -define_therm_throt_device_one_ro(core_throttle_total_time_ms); - -define_therm_throt_device_show_func(package_throttle, total_time_ms); -define_therm_throt_device_one_ro(package_throttle_total_time_ms); - -static struct attribute *thermal_throttle_attrs[] = { - &dev_attr_core_throttle_count.attr, - &dev_attr_core_throttle_max_time_ms.attr, - &dev_attr_core_throttle_total_time_ms.attr, - NULL -}; - -static const struct attribute_group thermal_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" -}; -#endif /* CONFIG_SYSFS */ - -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 - -#define THERM_THROT_POLL_INTERVAL HZ -#define THERM_STATUS_PROCHOT_LOG BIT(1) - -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) - -static void clear_therm_status_log(int level) -{ - int msr; - u64 mask, msr_val; - - if (level == CORE_LEVEL) { - msr = MSR_IA32_THERM_STATUS; - mask = THERM_STATUS_CLEAR_CORE_MASK; - } else { - msr = MSR_IA32_PACKAGE_THERM_STATUS; - mask = THERM_STATUS_CLEAR_PKG_MASK; - } - - rdmsrl(msr, msr_val); - msr_val &= mask; - wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); -} - -static void get_therm_status(int level, bool *proc_hot, u8 *temp) -{ - int msr; - u64 msr_val; - - if (level == CORE_LEVEL) - msr = MSR_IA32_THERM_STATUS; - else - msr = MSR_IA32_PACKAGE_THERM_STATUS; - - rdmsrl(msr, msr_val); - if (msr_val & THERM_STATUS_PROCHOT_LOG) - *proc_hot = true; - else - *proc_hot = false; - - *temp = (msr_val >> 16) & 0x7F; -} - -static void __maybe_unused throttle_active_work(struct work_struct *work) -{ - struct _thermal_state *state = container_of(to_delayed_work(work), - struct _thermal_state, therm_work); - unsigned int i, avg, this_cpu = smp_processor_id(); - u64 now = get_jiffies_64(); - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* temperature value is offset from the max so lesser means hotter */ - if (!hot && temp > state->baseline_temp) { - if (state->rate_control_active) - pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - - state->rate_control_active = false; - return; - } - - if (time_before64(now, state->next_check) && - state->rate_control_active) - goto re_arm; - - state->next_check = now + CHECK_INTERVAL; - - if (state->count != state->last_count) { - /* There was one new thermal interrupt */ - state->last_count = state->count; - state->average = 0; - state->sample_count = 0; - state->sample_index = 0; - } - - state->temp_samples[state->sample_index] = temp; - state->sample_count++; - state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); - if (state->sample_count < ARRAY_SIZE(state->temp_samples)) - goto re_arm; - - avg = 0; - for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) - avg += state->temp_samples[i]; - - avg /= ARRAY_SIZE(state->temp_samples); - - if (state->average > avg) { - pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - state->rate_control_active = true; - } - - state->average = avg; - -re_arm: - clear_therm_status_log(state->level); - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); -} - -/*** - * therm_throt_process - Process thermal throttling event from interrupt - * @curr: Whether the condition is current or not (boolean), since the - * thermal interrupt normally gets called both when the thermal - * event begins and once the event has ended. - * - * This function is called by the thermal interrupt after the - * IRQ has been acknowledged. - * - * It will take care of rate limiting and printing messages to the syslog. - */ -static void therm_throt_process(bool new_event, int event, int level) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - bool old_event; - u64 now; - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - - now = get_jiffies_64(); - if (level == CORE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->core_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->core_power_limit; - else - return; - } else if (level == PACKAGE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->package_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->package_power_limit; - else - return; - } else - return; - - old_event = state->new_event; - state->new_event = new_event; - - if (new_event) - state->count++; - - if (event != THERMAL_THROTTLING_EVENT) - return; - - if (new_event && !state->last_interrupt_time) { - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* - * Ignore short temperature spike as the system is not close - * to PROCHOT. 10C offset is large enough to ignore. It is - * already dropped from the high threshold temperature. - */ - if (temp > 10) - return; - - state->baseline_temp = temp; - state->last_interrupt_time = now; - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); - } else if (old_event && state->last_interrupt_time) { - unsigned long throttle_time; - - throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); - if (throttle_time > state->max_time_ms) - state->max_time_ms = throttle_time; - state->total_time_ms += throttle_time; - state->last_interrupt_time = 0; - } -} - -static int thresh_event_valid(int level, int event) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - u64 now = get_jiffies_64(); - - if (level == PACKAGE_LEVEL) - state = (event == 0) ? &pstate->pkg_thresh0 : - &pstate->pkg_thresh1; - else - state = (event == 0) ? &pstate->core_thresh0 : - &pstate->core_thresh1; - - if (time_before64(now, state->next_check)) - return 0; - - state->next_check = now + CHECK_INTERVAL; - - return 1; -} - -static bool int_pln_enable; -static int __init int_pln_enable_setup(char *s) -{ - int_pln_enable = true; - - return 1; -} -__setup("int_pln_enable", int_pln_enable_setup); - -#ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device: */ -static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) -{ - int err; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - err = sysfs_create_group(&dev->kobj, &thermal_attr_group); - if (err) - return err; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_core_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - - if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_max_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_total_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - } - - return 0; - -del_group: - sysfs_remove_group(&dev->kobj, &thermal_attr_group); - - return err; -} - -static void thermal_throttle_remove_dev(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &thermal_attr_group); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int thermal_throttle_online(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - state->package_throttle.level = PACKAGE_LEVEL; - state->core_throttle.level = CORE_LEVEL; - - INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); - INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); - - /* Unmask the thermal vector after the above workqueues are initialized. */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - return thermal_throttle_add_dev(dev, cpu); -} - -static int thermal_throttle_offline(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - /* Mask the thermal vector before draining evtl. pending work */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); - - cancel_delayed_work_sync(&state->package_throttle.therm_work); - cancel_delayed_work_sync(&state->core_throttle.therm_work); - - state->package_throttle.rate_control_active = false; - state->core_throttle.rate_control_active = false; - - thermal_throttle_remove_dev(dev); - return 0; -} - -static __init int thermal_throttle_init_device(void) -{ - int ret; - - if (!atomic_read(&therm_throt_en)) - return 0; - - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", - thermal_throttle_online, - thermal_throttle_offline); - return ret < 0 ? ret : 0; -} -device_initcall(thermal_throttle_init_device); - -#endif /* CONFIG_SYSFS */ - -static void notify_package_thresholds(__u64 msr_val) -{ - bool notify_thres_0 = false; - bool notify_thres_1 = false; - - if (!platform_thermal_package_notify) - return; - - /* lower threshold check */ - if (msr_val & THERM_LOG_THRESHOLD0) - notify_thres_0 = true; - /* higher threshold check */ - if (msr_val & THERM_LOG_THRESHOLD1) - notify_thres_1 = true; - - if (!notify_thres_0 && !notify_thres_1) - return; - - if (platform_thermal_package_rate_control && - platform_thermal_package_rate_control()) { - /* Rate control is implemented in callback */ - platform_thermal_package_notify(msr_val); - return; - } - - /* lower threshold reached */ - if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) - platform_thermal_package_notify(msr_val); - /* higher threshold reached */ - if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) - platform_thermal_package_notify(msr_val); -} - -static void notify_thresholds(__u64 msr_val) -{ - /* check whether the interrupt handler is defined; - * otherwise simply return - */ - if (!platform_thermal_notify) - return; - - /* lower threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD0) && - thresh_event_valid(CORE_LEVEL, 0)) - platform_thermal_notify(msr_val); - /* higher threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD1) && - thresh_event_valid(CORE_LEVEL, 1)) - platform_thermal_notify(msr_val); -} - -/* Thermal transition interrupt handler */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - if (static_cpu_has(X86_FEATURE_HWP)) - wrmsrl_safe(MSR_HWP_STATUS, 0); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - - /* Check for violation of core thermal thresholds*/ - notify_thresholds(msr_val); - - therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PTS)) { - rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - /* check violations of package thermal thresholds */ - notify_package_thresholds(msr_val); - therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL); - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & - PACKAGE_THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - PACKAGE_LEVEL); - } -} - -static void unexpected_thermal_interrupt(void) -{ - pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); -} - -static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; - -DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) -{ - trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); - trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); -} - -/* Thermal monitoring depends on APIC, ACPI and clock modulation */ -static int intel_thermal_supported(struct cpuinfo_x86 *c) -{ - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return 0; - return 1; -} - -void __init mcheck_intel_therm_init(void) -{ - /* - * This function is only called on boot CPU. Save the init thermal - * LVT value on BSP and use that value to restore APs' thermal LVT - * entry BIOS programmed later - */ - if (intel_thermal_supported(&boot_cpu_data)) - lvtthmr_init = apic_read(APIC_LVTTHMR); -} - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - if (!intel_thermal_supported(c)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - h = lvtthmr_init; - /* - * The initial value of thermal LVT entries on all APs always reads - * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI - * sequence to them and LVT registers are reset to 0s except for - * the mask bits which are set to 1s when APs receive INIT IPI. - * If BIOS takes over the thermal interrupt and sets its interrupt - * delivery mode to SMI (not fixed), it restores the value that the - * BIOS has programmed on AP based on BSP's info we saved since BIOS - * is always setting the same value for all threads/cores. - */ - if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) - apic_write(APIC_LVTTHMR, lvtthmr_init); - - - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - if (system_state == SYSTEM_BOOTING) - pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - /* early Pentium M models use different method for enabling TM2 */ - if (cpu_has(c, X86_FEATURE_TM2)) { - if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { - rdmsr(MSR_THERM2_CTL, l, h); - if (l & MSR_THERM2_CTL_TM_SELECT) - tm2 = 1; - } else if (l & MSR_IA32_MISC_ENABLE_TM2) - tm2 = 1; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - (l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - if (cpu_has(c, X86_FEATURE_PTS)) { - rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - (l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE)) - & ~PACKAGE_THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE - | PACKAGE_THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); - } - - smp_thermal_vector = intel_thermal_interrupt; - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 3f6b137ef4e6..3d4a48336084 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -215,7 +215,6 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size default: WARN(1, "%s: WTF family: 0x%x\n", __func__, family); return 0; - break; } if (sh_psize > min_t(u32, buf_size, max_size)) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ec6f0415bc6d..b935e1b5f115 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -int __init microcode_init(void) +static int __init microcode_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; int error; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 05ef1f4550cb..e88bc296afca 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -31,6 +31,11 @@ #include <asm/reboot.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> +#include <asm/numa.h> + +/* Is Linux running as the root partition? */ +bool hv_root_partition; +EXPORT_SYMBOL_GPL(hv_root_partition); struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -135,14 +140,32 @@ static void hv_machine_shutdown(void) { if (kexec_in_progress && hv_kexec_handler) hv_kexec_handler(); + + /* + * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor + * corrupts the old VP Assist Pages and can crash the kexec kernel. + */ + if (kexec_in_progress && hyperv_init_cpuhp > 0) + cpuhp_remove_state(hyperv_init_cpuhp); + + /* The function calls stop_other_cpus(). */ native_machine_shutdown(); + + /* Disable the hypercall page when there is only 1 active CPU. */ + if (kexec_in_progress) + hyperv_cleanup(); } static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) hv_crash_handler(regs); + + /* The function calls crash_smp_send_stop(). */ native_machine_crash_shutdown(regs); + + /* Disable the hypercall page when there is only 1 active CPU. */ + hyperv_cleanup(); } #endif /* CONFIG_KEXEC_CORE */ #endif /* CONFIG_HYPERV */ @@ -208,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void) hv_init_spinlocks(); #endif } + +static void __init hv_smp_prepare_cpus(unsigned int max_cpus) +{ +#ifdef CONFIG_X86_64 + int i; + int ret; +#endif + + native_smp_prepare_cpus(max_cpus); + +#ifdef CONFIG_X86_64 + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); + BUG_ON(ret); + } + + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i); + BUG_ON(ret); + } +#endif +} #endif static void __init ms_hyperv_init_platform(void) @@ -225,6 +274,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); @@ -238,6 +288,22 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); /* + * Check CPU management privilege. + * + * To mirror what Windows does we should extract CPU management + * features and use the ReservedIdentityBit to detect if Linux is the + * root partition. But that requires negotiating CPU management + * interface (a process to be finalized). + * + * For now, use the privilege flag as the indicator for running as + * root. + */ + if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) { + hv_root_partition = true; + pr_info("Hyper-V: running as root partition\n"); + } + + /* * Extract host information. */ if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >= @@ -259,6 +325,14 @@ static void __init ms_hyperv_init_platform(void) x86_platform.calibrate_cpu = hv_get_tsc_khz; } + if (ms_hyperv.features_b & HV_ISOLATION) { + ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); + ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); + + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", + ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); + } + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { ms_hyperv.nested_features = cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); @@ -348,6 +422,8 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; + if (hv_root_partition) + smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif /* @@ -366,9 +442,38 @@ static void __init ms_hyperv_init_platform(void) #endif } +static bool __init ms_hyperv_x2apic_available(void) +{ + return x2apic_supported(); +} + +/* + * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping() + * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the + * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg(). + * + * Note: for a VM on Hyper-V, the I/O-APIC is the only device which + * (logically) generates MSIs directly to the system APIC irq domain. + * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the + * pci-hyperv host bridge. + */ +static bool __init ms_hyperv_msi_ext_dest_id(void) +{ + u32 eax; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); + if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) + return false; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; +} + const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .type = X86_HYPER_MS_HYPERV, + .init.x2apic_available = ms_hyperv_x2apic_available, + .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5bd011737272..9231640782fa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void) if (!size_base) continue; - size_base = to_size_factor(size_base, &size_factor), + size_base = to_size_factor(size_base, &size_factor); start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), + start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 23ad8e953dfb..b90f3f437765 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -3,7 +3,6 @@ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ -#define DEBUG #include <linux/export.h> #include <linux/init.h> @@ -167,9 +166,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, *repeat = 0; *uniform = 1; - /* Make end inclusive instead of exclusive */ - end--; - prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -261,6 +257,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) int repeat; u64 partial_end; + /* Make end inclusive instead of exclusive */ + end--; + if (!mtrr_state_set) return MTRR_TYPE_INVALID; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 6a80f36b5d59..28c8a23aa42e 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -31,8 +31,6 @@ System Programming Guide; Section 9.11. (1997 edition - PPro). */ -#define DEBUG - #include <linux/types.h> /* FIXME: kvm_para.h needs this */ #include <linux/stop_machine.h> @@ -794,8 +792,6 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; - rcu_cpu_starting(smp_processor_id()); - /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, @@ -813,7 +809,8 @@ void mtrr_ap_init(void) } /** - * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. + * mtrr_save_state - Save current fixed-range MTRR state of the first + * cpu in cpu_online_mask. */ void mtrr_save_state(void) { diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index a5ee607a3b89..3ef5868ac588 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -3,7 +3,7 @@ * local apic based NMI watchdog for various CPUs. * * This file also handles reservation of performance counters for coordination - * with other users (like oprofile). + * with other users. * * Note that these events normally don't tick when the CPU idles. This means * the frequency varies with CPU load. @@ -105,15 +105,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) } -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); - int reserve_perfctr_nmi(unsigned int msr) { unsigned int counter; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index e8b5f1cf1ae8..698bb26aeb6e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -895,6 +895,10 @@ static __init void __check_quirks_intel(void) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); + fallthrough; + case INTEL_FAM6_BROADWELL_X: + intel_rdt_mbm_apply_quirk(); + break; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index f65d3c0dbc41..c4d320d02fd5 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -264,7 +264,7 @@ void __exit rdtgroup_exit(void); struct rftype { char *name; umode_t mode; - struct kernfs_ops *kf_ops; + const struct kernfs_ops *kf_ops; unsigned long flags; unsigned long fflags; @@ -572,6 +572,7 @@ union cpuid_0x10_x_edx { void rdt_last_cmd_clear(void); void rdt_last_cmd_puts(const char *s); +__printf(1, 2) void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); @@ -619,6 +620,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a98519a3a2e6..7ac31210e452 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -64,6 +64,69 @@ unsigned int rdt_mon_features; */ unsigned int resctrl_cqm_threshold; +#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) + +/* + * The correction factor table is documented in Documentation/x86/resctrl.rst. + * If rmid > rmid threshold, MBM total and local values should be multiplied + * by the correction factor. + * + * The original table is modified for better code: + * + * 1. The threshold 0 is changed to rmid count - 1 so don't do correction + * for the case. + * 2. MBM total and local correction table indexed by core counter which is + * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. + * 3. The correction factor is normalized to 2^20 (1048576) so it's faster + * to calculate corrected value by shifting: + * corrected_value = (original_value * correction_factor) >> 20 + */ +static const struct mbm_correction_factor_table { + u32 rmidthreshold; + u64 cf; +} mbm_cf_table[] __initdata = { + {7, CF(1.000000)}, + {15, CF(1.000000)}, + {15, CF(0.969650)}, + {31, CF(1.000000)}, + {31, CF(1.066667)}, + {31, CF(0.969650)}, + {47, CF(1.142857)}, + {63, CF(1.000000)}, + {63, CF(1.185115)}, + {63, CF(1.066553)}, + {79, CF(1.454545)}, + {95, CF(1.000000)}, + {95, CF(1.230769)}, + {95, CF(1.142857)}, + {95, CF(1.066667)}, + {127, CF(1.000000)}, + {127, CF(1.254863)}, + {127, CF(1.185255)}, + {151, CF(1.000000)}, + {127, CF(1.066667)}, + {167, CF(1.000000)}, + {159, CF(1.454334)}, + {183, CF(1.000000)}, + {127, CF(0.969744)}, + {191, CF(1.280246)}, + {191, CF(1.230921)}, + {215, CF(1.000000)}, + {191, CF(1.143118)}, +}; + +static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; +static u64 mbm_cf __read_mostly; + +static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) +{ + /* Correct MBM value. */ + if (rmid > mbm_cf_rmidthreshold) + val = (val * mbm_cf) >> 20; + + return val; +} + static inline struct rmid_entry *__rmid_entry(u32 rmid) { struct rmid_entry *entry; @@ -260,7 +323,8 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) m->chunks += chunks; m->prev_msr = tval; - rr->val += m->chunks; + rr->val += get_corrected_mbm_count(rmid, m->chunks); + return 0; } @@ -279,7 +343,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) return; chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - cur_bw = (chunks * r->mon_scale) >> 20; + cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); @@ -642,3 +706,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } + +void __init intel_rdt_mbm_apply_quirk(void) +{ + int cf_index; + + cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; + if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { + pr_info("No MBM correction factor available\n"); + return; + } + + mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; + mbm_cf = mbm_cf_table[cf_index].cf; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 0daf2f1cf7a8..e916646adc69 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f3418428682b..f9190adc52cb 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -240,13 +240,13 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, return -EINVAL; } -static struct kernfs_ops rdtgroup_kf_single_ops = { +static const struct kernfs_ops rdtgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .write = rdtgroup_file_write, .seq_show = rdtgroup_seqfile_show, }; -static struct kernfs_ops kf_mondata_ops = { +static const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, }; @@ -525,89 +525,70 @@ static void rdtgroup_remove(struct rdtgroup *rdtgrp) kfree(rdtgrp); } -struct task_move_callback { - struct callback_head work; - struct rdtgroup *rdtgrp; -}; - -static void move_myself(struct callback_head *head) +static void _update_task_closid_rmid(void *task) { - struct task_move_callback *callback; - struct rdtgroup *rdtgrp; - - callback = container_of(head, struct task_move_callback, work); - rdtgrp = callback->rdtgrp; - /* - * If resource group was deleted before this task work callback - * was invoked, then assign the task to root group and free the - * resource group. + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. */ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - current->closid = 0; - current->rmid = 0; - rdtgroup_remove(rdtgrp); - } - - if (unlikely(current->flags & PF_EXITING)) - goto out; - - preempt_disable(); - /* update PQR_ASSOC MSR to make resource group go into effect */ - resctrl_sched_in(); - preempt_enable(); + if (task == current) + resctrl_sched_in(); +} -out: - kfree(callback); +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); } static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { - struct task_move_callback *callback; - int ret; - - callback = kzalloc(sizeof(*callback), GFP_KERNEL); - if (!callback) - return -ENOMEM; - callback->work.func = move_myself; - callback->rdtgrp = rdtgrp; + /* If the task is already in rdtgrp, no need to move the task. */ + if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && + tsk->rmid == rdtgrp->mon.rmid) || + (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && + tsk->closid == rdtgrp->mon.parent->closid)) + return 0; /* - * Take a refcount, so rdtgrp cannot be freed before the - * callback has been invoked. + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. + * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. */ - atomic_inc(&rdtgrp->waitcount); - ret = task_work_add(tsk, &callback->work, TWA_RESUME); - if (ret) { - /* - * Task is exiting. Drop the refcount and free the callback. - * No need to check the refcount as the group cannot be - * deleted before the write function unlocks rdtgroup_mutex. - */ - atomic_dec(&rdtgrp->waitcount); - kfree(callback); - rdt_last_cmd_puts("Task exited\n"); - } else { - /* - * For ctrl_mon groups move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - ret = -EINVAL; - } + + if (rdtgrp->type == RDTCTRL_GROUP) { + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) { + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } } - return ret; + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + */ + barrier(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. + */ + update_task_closid_rmid(tsk); + + return 0; } static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) @@ -2329,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. */ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); @@ -3023,8 +3000,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, return -EPERM; } -static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { struct rdtgroup *prdtgrp = rdtgrp->mon.parent; int cpu; @@ -3056,8 +3032,7 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, return 0; } -static int rdtgroup_ctrl_remove(struct kernfs_node *kn, - struct rdtgroup *rdtgrp) +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) { rdtgrp->flags = RDT_DELETED; list_del(&rdtgrp->rdtgroup_list); @@ -3066,8 +3041,7 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn, return 0; } -static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { int cpu; @@ -3094,7 +3068,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); - rdtgroup_ctrl_remove(kn, rdtgrp); + rdtgroup_ctrl_remove(rdtgrp); /* * Free all the child monitor group rmids. @@ -3131,13 +3105,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) rdtgrp != &rdtgroup_default) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(kn, rdtgrp); + ret = rdtgroup_ctrl_remove(rdtgrp); } else { - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); } } else if (rdtgrp->type == RDTMON_GROUP && is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); } else { ret = -EPERM; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 866c9a9bcdee..972ec3bfa9c0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,10 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile new file mode 100644 index 000000000000..91d3dc784a29 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -0,0 +1,5 @@ +obj-y += \ + driver.o \ + encl.o \ + ioctl.o \ + main.o diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h new file mode 100644 index 000000000000..dd7602c44c72 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/arch.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains data structures defined by the SGX architecture. Data structures + * defined by the Linux software stack should not be placed here. + */ +#ifndef _ASM_X86_SGX_ARCH_H +#define _ASM_X86_SGX_ARCH_H + +#include <linux/bits.h> +#include <linux/types.h> + +/* The SGX specific CPUID function. */ +#define SGX_CPUID 0x12 +/* EPC enumeration. */ +#define SGX_CPUID_EPC 2 +/* An invalid EPC section, i.e. the end marker. */ +#define SGX_CPUID_EPC_INVALID 0x0 +/* A valid EPC section. */ +#define SGX_CPUID_EPC_SECTION 0x1 +/* The bitmask for the EPC section type. */ +#define SGX_CPUID_EPC_MASK GENMASK(3, 0) + +/** + * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not + * been completed yet. + * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's + * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received + */ +enum sgx_return_code { + SGX_NOT_TRACKED = 11, + SGX_INVALID_EINITTOKEN = 16, + SGX_UNMASKED_EVENT = 128, +}; + +/* The modulus size for 3072-bit RSA keys. */ +#define SGX_MODULUS_SIZE 384 + +/** + * enum sgx_miscselect - additional information to an SSA frame + * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. + * + * Save State Area (SSA) is a stack inside the enclave used to store processor + * state when an exception or interrupt occurs. This enum defines additional + * information stored to an SSA frame. + */ +enum sgx_miscselect { + SGX_MISC_EXINFO = BIT(0), +}; + +#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1) + +#define SGX_SSA_GPRS_SIZE 184 +#define SGX_SSA_MISC_EXINFO_SIZE 16 + +/** + * enum sgx_attributes - the attributes field in &struct sgx_secs + * %SGX_ATTR_INIT: Enclave can be entered (is initialized). + * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). + * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. + * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote + * attestation. + * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). + * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to + * sign cryptographic tokens that can be passed to + * EINIT as an authorization to run an enclave. + */ +enum sgx_attribute { + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + SGX_ATTR_KSS = BIT(7), +}; + +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) + +/** + * struct sgx_secs - SGX Enclave Control Structure (SECS) + * @size: size of the address space + * @base: base address of the address space + * @ssa_frame_size: size of an SSA frame + * @miscselect: additional information stored to an SSA frame + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @mrenclave: SHA256-hash of the enclave contents + * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT + * @config_id: a user-defined value that is used in key derivation + * @isv_prod_id: a user-defined value that is used in key derivation + * @isv_svn: a user-defined value that is used in key derivation + * @config_svn: a user-defined value that is used in key derivation + * + * SGX Enclave Control Structure (SECS) is a special enclave page that is not + * visible in the address space. In fact, this structure defines the address + * range and other global attributes for the enclave and it is the first EPC + * page created for any enclave. It is moved from a temporary buffer to an EPC + * by the means of ENCLS[ECREATE] function. + */ +struct sgx_secs { + u64 size; + u64 base; + u32 ssa_frame_size; + u32 miscselect; + u8 reserved1[24]; + u64 attributes; + u64 xfrm; + u32 mrenclave[8]; + u8 reserved2[32]; + u32 mrsigner[8]; + u8 reserved3[32]; + u32 config_id[16]; + u16 isv_prod_id; + u16 isv_svn; + u16 config_svn; + u8 reserved4[3834]; +} __packed; + +/** + * enum sgx_tcs_flags - execution flags for TCS + * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints + * inside an enclave. It is cleared by EADD but can + * be set later with EDBGWR. + */ +enum sgx_tcs_flags { + SGX_TCS_DBGOPTIN = 0x01, +}; + +#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1) +#define SGX_TCS_RESERVED_SIZE 4024 + +/** + * struct sgx_tcs - Thread Control Structure (TCS) + * @state: used to mark an entered TCS + * @flags: execution flags (cleared by EADD) + * @ssa_offset: SSA stack offset relative to the enclave base + * @ssa_index: the current SSA frame index (cleard by EADD) + * @nr_ssa_frames: the number of frame in the SSA stack + * @entry_offset: entry point offset relative to the enclave base + * @exit_addr: address outside the enclave to exit on an exception or + * interrupt + * @fs_offset: offset relative to the enclave base to become FS + * segment inside the enclave + * @gs_offset: offset relative to the enclave base to become GS + * segment inside the enclave + * @fs_limit: size to become a new FS-limit (only 32-bit enclaves) + * @gs_limit: size to become a new GS-limit (only 32-bit enclaves) + * + * Thread Control Structure (TCS) is an enclave page visible in its address + * space that defines an entry point inside the enclave. A thread enters inside + * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered + * by only one thread at a time. + */ +struct sgx_tcs { + u64 state; + u64 flags; + u64 ssa_offset; + u32 ssa_index; + u32 nr_ssa_frames; + u64 entry_offset; + u64 exit_addr; + u64 fs_offset; + u64 gs_offset; + u32 fs_limit; + u32 gs_limit; + u8 reserved[SGX_TCS_RESERVED_SIZE]; +} __packed; + +/** + * struct sgx_pageinfo - an enclave page descriptor + * @addr: address of the enclave page + * @contents: pointer to the page contents + * @metadata: pointer either to a SECINFO or PCMD instance + * @secs: address of the SECS page + */ +struct sgx_pageinfo { + u64 addr; + u64 contents; + u64 metadata; + u64 secs; +} __packed __aligned(32); + + +/** + * enum sgx_page_type - bits in the SECINFO flags defining the page type + * %SGX_PAGE_TYPE_SECS: a SECS page + * %SGX_PAGE_TYPE_TCS: a TCS page + * %SGX_PAGE_TYPE_REG: a regular page + * %SGX_PAGE_TYPE_VA: a VA page + * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + */ +enum sgx_page_type { + SGX_PAGE_TYPE_SECS, + SGX_PAGE_TYPE_TCS, + SGX_PAGE_TYPE_REG, + SGX_PAGE_TYPE_VA, + SGX_PAGE_TYPE_TRIM, +}; + +#define SGX_NR_PAGE_TYPES 5 +#define SGX_PAGE_TYPE_MASK GENMASK(7, 0) + +/** + * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo + * %SGX_SECINFO_R: allow read + * %SGX_SECINFO_W: allow write + * %SGX_SECINFO_X: allow execution + * %SGX_SECINFO_SECS: a SECS page + * %SGX_SECINFO_TCS: a TCS page + * %SGX_SECINFO_REG: a regular page + * %SGX_SECINFO_VA: a VA page + * %SGX_SECINFO_TRIM: a page in trimmed state + */ +enum sgx_secinfo_flags { + SGX_SECINFO_R = BIT(0), + SGX_SECINFO_W = BIT(1), + SGX_SECINFO_X = BIT(2), + SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8), + SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8), + SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8), + SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8), + SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8), +}; + +#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0) +#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8) +#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \ + SGX_SECINFO_PAGE_TYPE_MASK) + +/** + * struct sgx_secinfo - describes attributes of an EPC page + * @flags: permissions and type + * + * Used together with ENCLS leaves that add or modify an EPC page to an + * enclave to define page permissions and type. + */ +struct sgx_secinfo { + u64 flags; + u8 reserved[56]; +} __packed __aligned(64); + +#define SGX_PCMD_RESERVED_SIZE 40 + +/** + * struct sgx_pcmd - Paging Crypto Metadata (PCMD) + * @enclave_id: enclave identifier + * @mac: MAC over PCMD, page contents and isvsvn + * + * PCMD is stored for every swapped page to the regular memory. When ELDU loads + * the page back it recalculates the MAC by using a isvsvn number stored in a + * VA page. Together these two structures bring integrity and rollback + * protection. + */ +struct sgx_pcmd { + struct sgx_secinfo secinfo; + u64 enclave_id; + u8 reserved[SGX_PCMD_RESERVED_SIZE]; + u8 mac[16]; +} __packed __aligned(128); + +#define SGX_SIGSTRUCT_RESERVED1_SIZE 84 +#define SGX_SIGSTRUCT_RESERVED2_SIZE 20 +#define SGX_SIGSTRUCT_RESERVED3_SIZE 32 +#define SGX_SIGSTRUCT_RESERVED4_SIZE 12 + +/** + * struct sgx_sigstruct_header - defines author of the enclave + * @header1: constant byte string + * @vendor: must be either 0x0000 or 0x8086 + * @date: YYYYMMDD in BCD + * @header2: costant byte string + * @swdefined: software defined value + */ +struct sgx_sigstruct_header { + u64 header1[2]; + u32 vendor; + u32 date; + u64 header2[2]; + u32 swdefined; + u8 reserved1[84]; +} __packed; + +/** + * struct sgx_sigstruct_body - defines contents of the enclave + * @miscselect: additional information stored to an SSA frame + * @misc_mask: required miscselect in SECS + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @attributes_mask: required attributes in SECS + * @xfrm_mask: required XFRM in SECS + * @mrenclave: SHA256-hash of the enclave contents + * @isvprodid: a user-defined value that is used in key derivation + * @isvsvn: a user-defined value that is used in key derivation + */ +struct sgx_sigstruct_body { + u32 miscselect; + u32 misc_mask; + u8 reserved2[20]; + u64 attributes; + u64 xfrm; + u64 attributes_mask; + u64 xfrm_mask; + u8 mrenclave[32]; + u8 reserved3[32]; + u16 isvprodid; + u16 isvsvn; +} __packed; + +/** + * struct sgx_sigstruct - an enclave signature + * @header: defines author of the enclave + * @modulus: the modulus of the public key + * @exponent: the exponent of the public key + * @signature: the signature calculated over the fields except modulus, + * @body: defines contents of the enclave + * @q1: a value used in RSA signature verification + * @q2: a value used in RSA signature verification + * + * Header and body are the parts that are actual signed. The remaining fields + * define the signature of the enclave. + */ +struct sgx_sigstruct { + struct sgx_sigstruct_header header; + u8 modulus[SGX_MODULUS_SIZE]; + u32 exponent; + u8 signature[SGX_MODULUS_SIZE]; + struct sgx_sigstruct_body body; + u8 reserved4[12]; + u8 q1[SGX_MODULUS_SIZE]; + u8 q2[SGX_MODULUS_SIZE]; +} __packed; + +#define SGX_LAUNCH_TOKEN_SIZE 304 + +#endif /* _ASM_X86_SGX_ARCH_H */ diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c new file mode 100644 index 000000000000..8ce6d8371cfb --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/acpi.h> +#include <linux/miscdevice.h> +#include <linux/mman.h> +#include <linux/security.h> +#include <linux/suspend.h> +#include <asm/traps.h> +#include "driver.h" +#include "encl.h" + +u64 sgx_attributes_reserved_mask; +u64 sgx_xfrm_reserved_mask = ~0x3; +u32 sgx_misc_reserved_mask; + +static int sgx_open(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl; + int ret; + + encl = kzalloc(sizeof(*encl), GFP_KERNEL); + if (!encl) + return -ENOMEM; + + kref_init(&encl->refcount); + xa_init(&encl->page_array); + mutex_init(&encl->lock); + INIT_LIST_HEAD(&encl->va_pages); + INIT_LIST_HEAD(&encl->mm_list); + spin_lock_init(&encl->mm_lock); + + ret = init_srcu_struct(&encl->srcu); + if (ret) { + kfree(encl); + return ret; + } + + file->private_data = encl; + + return 0; +} + +static int sgx_release(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl = file->private_data; + struct sgx_encl_mm *encl_mm; + + /* + * Drain the remaining mm_list entries. At this point the list contains + * entries for processes, which have closed the enclave file but have + * not exited yet. The processes, which have exited, are gone from the + * list by sgx_mmu_notifier_release(). + */ + for ( ; ; ) { + spin_lock(&encl->mm_lock); + + if (list_empty(&encl->mm_list)) { + encl_mm = NULL; + } else { + encl_mm = list_first_entry(&encl->mm_list, + struct sgx_encl_mm, list); + list_del_rcu(&encl_mm->list); + } + + spin_unlock(&encl->mm_lock); + + /* The enclave is no longer mapped by any mm. */ + if (!encl_mm) + break; + + synchronize_srcu(&encl->srcu); + mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm); + kfree(encl_mm); + + /* 'encl_mm' is gone, put encl_mm->encl reference: */ + kref_put(&encl->refcount, sgx_encl_release); + } + + kref_put(&encl->refcount, sgx_encl_release); + return 0; +} + +static int sgx_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_encl *encl = file->private_data; + int ret; + + ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags); + if (ret) + return ret; + + ret = sgx_encl_mm_add(encl, vma->vm_mm); + if (ret) + return ret; + + vma->vm_ops = &sgx_vm_ops; + vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vma->vm_private_data = encl; + + return 0; +} + +static unsigned long sgx_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + if ((flags & MAP_TYPE) == MAP_PRIVATE) + return -EINVAL; + + if (flags & MAP_FIXED) + return addr; + + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + +#ifdef CONFIG_COMPAT +static long sgx_compat_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return sgx_ioctl(filep, cmd, arg); +} +#endif + +static const struct file_operations sgx_encl_fops = { + .owner = THIS_MODULE, + .open = sgx_open, + .release = sgx_release, + .unlocked_ioctl = sgx_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sgx_compat_ioctl, +#endif + .mmap = sgx_mmap, + .get_unmapped_area = sgx_get_unmapped_area, +}; + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_enclave = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_enclave", + .nodename = "sgx_enclave", + .fops = &sgx_encl_fops, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +int __init sgx_drv_init(void) +{ + unsigned int eax, ebx, ecx, edx; + u64 attr_mask; + u64 xfrm_mask; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + return -ENODEV; + + cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); + + if (!(eax & 1)) { + pr_err("SGX disabled: SGX1 instruction support not available.\n"); + return -ENODEV; + } + + sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK; + + cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx); + + attr_mask = (((u64)ebx) << 32) + (u64)eax; + sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK; + + if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + xfrm_mask = (((u64)edx) << 32) + (u64)ecx; + sgx_xfrm_reserved_mask = ~xfrm_mask; + } + + ret = misc_register(&sgx_dev_enclave); + if (ret) + return ret; + + ret = misc_register(&sgx_dev_provision); + if (ret) { + misc_deregister(&sgx_dev_enclave); + return ret; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h new file mode 100644 index 000000000000..4eddb4d571ef --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_SGX_DRIVER_H__ +#define __ARCH_SGX_DRIVER_H__ + +#include <crypto/hash.h> +#include <linux/kref.h> +#include <linux/mmu_notifier.h> +#include <linux/radix-tree.h> +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <uapi/asm/sgx.h> +#include "sgx.h" + +#define SGX_EINIT_SPIN_COUNT 20 +#define SGX_EINIT_SLEEP_COUNT 50 +#define SGX_EINIT_SLEEP_TIME 20 + +extern u64 sgx_attributes_reserved_mask; +extern u64 sgx_xfrm_reserved_mask; +extern u32 sgx_misc_reserved_mask; + +extern const struct file_operations sgx_provision_fops; + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +int sgx_drv_init(void); + +#endif /* __ARCH_X86_SGX_DRIVER_H__ */ diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c new file mode 100644 index 000000000000..7449ef33f081 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/lockdep.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/shmem_fs.h> +#include <linux/suspend.h> +#include <linux/sched/mm.h> +#include "arch.h" +#include "encl.h" +#include "encls.h" +#include "sgx.h" + +/* + * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC + * Pages" in the SDM. + */ +static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_epc_page *secs_page) +{ + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_pageinfo pginfo; + struct sgx_backing b; + pgoff_t page_index; + int ret; + + if (secs_page) + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + else + page_index = PFN_DOWN(encl->size); + + ret = sgx_encl_get_backing(encl, page_index, &b); + if (ret) + return ret; + + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.contents = (unsigned long)kmap_atomic(b.contents); + pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + + b.pcmd_offset; + + if (secs_page) + pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); + else + pginfo.secs = 0; + + ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page), + sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ELDU"); + + ret = -EFAULT; + } + + kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + sgx_encl_put_backing(&b, false); + + return ret; +} + +static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *secs_page) +{ + + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) + return epc_page; + + ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); + if (ret) { + sgx_free_epc_page(epc_page); + return ERR_PTR(ret); + } + + sgx_free_va_slot(encl_page->va_page, va_offset); + list_move(&encl_page->va_page->list, &encl->va_pages); + encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK; + encl_page->epc_page = epc_page; + + return epc_page; +} + +static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_epc_page *epc_page; + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the faulted page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + /* Entry successfully located. */ + if (entry->epc_page) { + if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED) + return ERR_PTR(-EBUSY); + + return entry; + } + + if (!(encl->secs.epc_page)) { + epc_page = sgx_encl_eldu(&encl->secs, NULL); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + } + + epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + encl->secs_child_cnt++; + sgx_mark_page_reclaimable(entry->epc_page); + + return entry; +} + +static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) +{ + unsigned long addr = (unsigned long)vmf->address; + struct vm_area_struct *vma = vmf->vma; + struct sgx_encl_page *entry; + unsigned long phys_addr; + struct sgx_encl *encl; + vm_fault_t ret; + + encl = vma->vm_private_data; + + /* + * It's very unlikely but possible that allocating memory for the + * mm_list entry of a forked process failed in sgx_vma_open(). When + * this happens, vm_private_data is set to NULL. + */ + if (unlikely(!encl)) + return VM_FAULT_SIGBUS; + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + if (IS_ERR(entry)) { + mutex_unlock(&encl->lock); + + if (PTR_ERR(entry) == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; + } + + phys_addr = sgx_get_epc_phys_addr(entry->epc_page); + + ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (ret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + + return VM_FAULT_SIGBUS; + } + + sgx_encl_test_and_clear_young(vma->vm_mm, entry); + mutex_unlock(&encl->lock); + + return VM_FAULT_NOPAGE; +} + +static void sgx_vma_open(struct vm_area_struct *vma) +{ + struct sgx_encl *encl = vma->vm_private_data; + + /* + * It's possible but unlikely that vm_private_data is NULL. This can + * happen in a grandchild of a process, when sgx_encl_mm_add() had + * failed to allocate memory in this callback. + */ + if (unlikely(!encl)) + return; + + if (sgx_encl_mm_add(encl, vma->vm_mm)) + vma->vm_private_data = NULL; +} + + +/** + * sgx_encl_may_map() - Check if a requested VMA mapping is allowed + * @encl: an enclave pointer + * @start: lower bound of the address range, inclusive + * @end: upper bound of the address range, exclusive + * @vm_flags: VMA flags + * + * Iterate through the enclave pages contained within [@start, @end) to verify + * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC} + * do not contain any permissions that are not contained in the build time + * permissions of any of the enclave pages within the given address range. + * + * An enclave creator must declare the strongest permissions that will be + * needed for each enclave page. This ensures that mappings have the identical + * or weaker permissions than the earlier declared permissions. + * + * Return: 0 on success, -EACCES otherwise + */ +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *page; + unsigned long count = 0; + int ret = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + + /* + * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might + * conflict with the enclave page permissions. + */ + if (current->personality & READ_IMPLIES_EXEC) + return -EACCES; + + mutex_lock(&encl->lock); + xas_lock(&xas); + xas_for_each(&xas, page, PFN_DOWN(end - 1)) { + if (~page->vm_max_prot_bits & vm_prot_bits) { + ret = -EACCES; + break; + } + + /* Reschedule on every XA_CHECK_SCHED iteration. */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + cond_resched(); + + mutex_lock(&encl->lock); + xas_lock(&xas); + } + } + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + return ret; +} + +static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + return sgx_encl_may_map(vma->vm_private_data, start, end, newflags); +} + +static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + + ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +/* + * Load an enclave page to EPC if required, and take encl->lock. + */ +static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + struct sgx_encl_page *entry; + + for ( ; ; ) { + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr, vm_flags); + if (PTR_ERR(entry) != -EBUSY) + break; + + mutex_unlock(&encl->lock); + } + + if (IS_ERR(entry)) + mutex_unlock(&encl->lock); + + return entry; +} + +static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct sgx_encl *encl = vma->vm_private_data; + struct sgx_encl_page *entry = NULL; + char data[sizeof(unsigned long)]; + unsigned long align; + int offset; + int cnt; + int ret = 0; + int i; + + /* + * If process was forked, VMA is still there but vm_private_data is set + * to NULL. + */ + if (!encl) + return -EFAULT; + + if (!test_bit(SGX_ENCL_DEBUG, &encl->flags)) + return -EFAULT; + + for (i = 0; i < len; i += cnt) { + entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK, + vma->vm_flags); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + break; + } + + align = ALIGN_DOWN(addr + i, sizeof(unsigned long)); + offset = (addr + i) & (sizeof(unsigned long) - 1); + cnt = sizeof(unsigned long) - offset; + cnt = min(cnt, len - i); + + ret = sgx_encl_debug_read(encl, entry, align, data); + if (ret) + goto out; + + if (write) { + memcpy(data + offset, buf + i, cnt); + ret = sgx_encl_debug_write(encl, entry, align, data); + if (ret) + goto out; + } else { + memcpy(buf + i, data + offset, cnt); + } + +out: + mutex_unlock(&encl->lock); + + if (ret) + break; + } + + return ret < 0 ? ret : i; +} + +const struct vm_operations_struct sgx_vm_ops = { + .fault = sgx_vma_fault, + .mprotect = sgx_vma_mprotect, + .open = sgx_vma_open, + .access = sgx_vma_access, +}; + +/** + * sgx_encl_release - Destroy an enclave instance + * @kref: address of a kref inside &sgx_encl + * + * Used together with kref_put(). Frees all the resources associated with the + * enclave and the instance itself. + */ +void sgx_encl_release(struct kref *ref) +{ + struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + struct sgx_va_page *va_page; + struct sgx_encl_page *entry; + unsigned long index; + + xa_for_each(&encl->page_array, index, entry) { + if (entry->epc_page) { + /* + * The page and its radix tree entry cannot be freed + * if the page is being held by the reclaimer. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) + continue; + + sgx_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + } + + kfree(entry); + } + + xa_destroy(&encl->page_array); + + if (!encl->secs_child_cnt && encl->secs.epc_page) { + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + } + + while (!list_empty(&encl->va_pages)) { + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + list_del(&va_page->list); + sgx_free_epc_page(va_page->epc_page); + kfree(va_page); + } + + if (encl->backing) + fput(encl->backing); + + cleanup_srcu_struct(&encl->srcu); + + WARN_ON_ONCE(!list_empty(&encl->mm_list)); + + /* Detect EPC page leak's. */ + WARN_ON_ONCE(encl->secs_child_cnt); + WARN_ON_ONCE(encl->secs.epc_page); + + kfree(encl); +} + +/* + * 'mm' is exiting and no longer needs mmu notifications. + */ +static void sgx_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + struct sgx_encl_mm *tmp = NULL; + + /* + * The enclave itself can remove encl_mm. Note, objects can't be moved + * off an RCU protected list, but deletion is ok. + */ + spin_lock(&encl_mm->encl->mm_lock); + list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { + if (tmp == encl_mm) { + list_del_rcu(&encl_mm->list); + break; + } + } + spin_unlock(&encl_mm->encl->mm_lock); + + if (tmp == encl_mm) { + synchronize_srcu(&encl_mm->encl->srcu); + mmu_notifier_put(mn); + } +} + +static void sgx_mmu_notifier_free(struct mmu_notifier *mn) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + + /* 'encl_mm' is going away, put encl_mm->encl reference: */ + kref_put(&encl_mm->encl->refcount, sgx_encl_release); + + kfree(encl_mm); +} + +static const struct mmu_notifier_ops sgx_mmu_notifier_ops = { + .release = sgx_mmu_notifier_release, + .free_notifier = sgx_mmu_notifier_free, +}; + +static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = NULL; + struct sgx_encl_mm *tmp; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(tmp, &encl->mm_list, list) { + if (tmp->mm == mm) { + encl_mm = tmp; + break; + } + } + + srcu_read_unlock(&encl->srcu, idx); + + return encl_mm; +} + +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm; + int ret; + + /* + * Even though a single enclave may be mapped into an mm more than once, + * each 'mm' only appears once on encl->mm_list. This is guaranteed by + * holding the mm's mmap lock for write before an mm can be added or + * remove to an encl->mm_list. + */ + mmap_assert_write_locked(mm); + + /* + * It's possible that an entry already exists in the mm_list, because it + * is removed only on VFS release or process exit. + */ + if (sgx_encl_find_mm(encl, mm)) + return 0; + + encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL); + if (!encl_mm) + return -ENOMEM; + + /* Grab a refcount for the encl_mm->encl reference: */ + kref_get(&encl->refcount); + encl_mm->encl = encl; + encl_mm->mm = mm; + encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; + + ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm); + if (ret) { + kfree(encl_mm); + return ret; + } + + spin_lock(&encl->mm_lock); + list_add_rcu(&encl_mm->list, &encl->mm_list); + /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + smp_wmb(); + encl->mm_list_version++; + spin_unlock(&encl->mm_lock); + + return 0; +} + +static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, + pgoff_t index) +{ + struct inode *inode = encl->backing->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + gfp_t gfpmask = mapping_gfp_mask(mapping); + + return shmem_read_mapping_page_gfp(mapping, index, gfpmask); +} + +/** + * sgx_encl_get_backing() - Pin the backing storage + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Pin the backing storage pages for storing the encrypted contents and Paging + * Crypto MetaData (PCMD) of an enclave page. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + struct page *contents; + struct page *pcmd; + + contents = sgx_encl_get_backing_page(encl, page_index); + if (IS_ERR(contents)) + return PTR_ERR(contents); + + pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + if (IS_ERR(pcmd)) { + put_page(contents); + return PTR_ERR(pcmd); + } + + backing->page_index = page_index; + backing->contents = contents; + backing->pcmd = pcmd; + backing->pcmd_offset = + (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * + sizeof(struct sgx_pcmd); + + return 0; +} + +/** + * sgx_encl_put_backing() - Unpin the backing storage + * @backing: data for accessing backing storage for the page + * @do_write: mark pages dirty + */ +void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +{ + if (do_write) { + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); + } + + put_page(backing->pcmd); + put_page(backing->contents); +} + +static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr, + void *data) +{ + pte_t pte; + int ret; + + ret = pte_young(*ptep); + if (ret) { + pte = pte_mkold(*ptep); + set_pte_at((struct mm_struct *)data, addr, ptep, pte); + } + + return ret; +} + +/** + * sgx_encl_test_and_clear_young() - Test and reset the accessed bit + * @mm: mm_struct that is checked + * @page: enclave page to be tested for recent access + * + * Checks the Access (A) bit from the PTE corresponding to the enclave page and + * clears it. + * + * Return: 1 if the page has been recently accessed and 0 if not. + */ +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page) +{ + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + struct vm_area_struct *vma; + int ret; + + ret = sgx_encl_find(mm, addr, &vma); + if (ret) + return 0; + + if (encl != vma->vm_private_data) + return 0; + + ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE, + sgx_encl_test_and_clear_young_cb, vma->vm_mm); + if (ret < 0) + return 0; + + return ret; +} + +/** + * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * + * Allocate a free EPC page and convert it to a Version Array (VA) page. + * + * Return: + * a VA page, + * -errno otherwise + */ +struct sgx_epc_page *sgx_alloc_va_page(void) +{ + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(NULL, true); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + ret = __epa(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); + sgx_free_epc_page(epc_page); + return ERR_PTR(-EFAULT); + } + + return epc_page; +} + +/** + * sgx_alloc_va_slot - allocate a VA slot + * @va_page: a &struct sgx_va_page instance + * + * Allocates a slot from a &struct sgx_va_page instance. + * + * Return: offset of the slot inside the VA page + */ +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + if (slot < SGX_VA_SLOT_COUNT) + set_bit(slot, va_page->slots); + + return slot << 3; +} + +/** + * sgx_free_va_slot - free a VA slot + * @va_page: a &struct sgx_va_page instance + * @offset: offset of the slot inside the VA page + * + * Frees a slot from a &struct sgx_va_page instance. + */ +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset) +{ + clear_bit(offset >> 3, va_page->slots); +} + +/** + * sgx_va_page_full - is the VA page full? + * @va_page: a &struct sgx_va_page instance + * + * Return: true if all slots have been taken + */ +bool sgx_va_page_full(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + return slot == SGX_VA_SLOT_COUNT; +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h new file mode 100644 index 000000000000..d8d30ccbef4c --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains the software defined data structures for enclaves. + */ +#ifndef _X86_ENCL_H +#define _X86_ENCL_H + +#include <linux/cpumask.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mm_types.h> +#include <linux/mmu_notifier.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/srcu.h> +#include <linux/workqueue.h> +#include <linux/xarray.h> +#include "sgx.h" + +/* 'desc' bits holding the offset in the VA (version array) page. */ +#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3) + +/* 'desc' bit marking that the page is being reclaimed. */ +#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3) + +struct sgx_encl_page { + unsigned long desc; + unsigned long vm_max_prot_bits; + struct sgx_epc_page *epc_page; + struct sgx_encl *encl; + struct sgx_va_page *va_page; +}; + +enum sgx_encl_flags { + SGX_ENCL_IOCTL = BIT(0), + SGX_ENCL_DEBUG = BIT(1), + SGX_ENCL_CREATED = BIT(2), + SGX_ENCL_INITIALIZED = BIT(3), +}; + +struct sgx_encl_mm { + struct sgx_encl *encl; + struct mm_struct *mm; + struct list_head list; + struct mmu_notifier mmu_notifier; +}; + +struct sgx_encl { + unsigned long base; + unsigned long size; + unsigned long flags; + unsigned int page_cnt; + unsigned int secs_child_cnt; + struct mutex lock; + struct xarray page_array; + struct sgx_encl_page secs; + unsigned long attributes; + unsigned long attributes_mask; + + cpumask_t cpumask; + struct file *backing; + struct kref refcount; + struct list_head va_pages; + unsigned long mm_list_version; + struct list_head mm_list; + spinlock_t mm_lock; + struct srcu_struct srcu; +}; + +#define SGX_VA_SLOT_COUNT 512 + +struct sgx_va_page { + struct sgx_epc_page *epc_page; + DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT); + struct list_head list; +}; + +struct sgx_backing { + pgoff_t page_index; + struct page *contents; + struct page *pcmd; + unsigned long pcmd_offset; +}; + +extern const struct vm_operations_struct sgx_vm_ops; + +static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **vma) +{ + struct vm_area_struct *result; + + result = find_vma(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + return -EINVAL; + + *vma = result; + + return 0; +} + +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags); + +void sgx_encl_release(struct kref *ref); +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page); + +struct sgx_epc_page *sgx_alloc_va_page(void); +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); +bool sgx_va_page_full(struct sgx_va_page *va_page); + +#endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h new file mode 100644 index 000000000000..443188fe7e70 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_ENCLS_H +#define _X86_ENCLS_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include <asm/traps.h> +#include "sgx.h" + +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, +}; + +/** + * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr + * + * ENCLS has its own (positive value) error codes and also generates + * ENCLS specific #GP and #PF faults. And the ENCLS values get munged + * with system error codes as everything percolates back up the stack. + * Unfortunately (for us), we need to precisely identify each unique + * error code, e.g. the action taken if EWB fails varies based on the + * type of fault and on the exact SGX error code, i.e. we can't simply + * convert all faults to -EFAULT. + * + * To make all three error types coexist, we set bit 30 to identify an + * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate + * between positive (faults and SGX error codes) and negative (system + * error codes) values. + */ +#define ENCLS_FAULT_FLAG 0x40000000 + +/* Retrieve the encoded trapnr from the specified return code. */ +#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG) + +/* Issue a WARN() about an ENCLS function. */ +#define ENCLS_WARN(r, name) { \ + do { \ + int _r = (r); \ + WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \ + } while (0); \ +} + +/** + * encls_failed() - Check if an ENCLS function failed + * @ret: the return value of an ENCLS function call + * + * Check if an ENCLS function failed. This happens when the function causes a + * fault that is not caused by an EPCM conflict or when the function returns a + * non-zero value. + */ +static inline bool encls_failed(int ret) +{ + if (ret & ENCLS_FAULT_FLAG) + return ENCLS_TRAPNR(ret) != X86_TRAP_PF; + + return !!ret; +} + +/** + * __encls_ret_N - encode an ENCLS function that returns an error code in EAX + * @rax: function number + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE. + * And because SGX isn't complex enough as it is, function that return an error + * code also modify flags. + * + * Return: + * 0 on success, + * SGX error code on failure + */ +#define __encls_ret_N(rax, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : "=a"(ret) \ + : "a"(rax), inputs \ + : "memory", "cc"); \ + ret; \ + }) + +#define __encls_ret_1(rax, rcx) \ + ({ \ + __encls_ret_N(rax, "c"(rcx)); \ + }) + +#define __encls_ret_2(rax, rbx, rcx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_ret_3(rax, rbx, rcx, rdx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \ + }) + +/** + * __encls_N - encode an ENCLS function that doesn't return an error code + * @rax: function number + * @rbx_out: optional output variable + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that does not return an error code, e.g. + * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an + * optional parameter for use by EDGBRD, which returns the requested value in + * RBX. + * + * Return: + * 0 on success, + * trapnr with ENCLS_FAULT_FLAG set on fault + */ +#define __encls_N(rax, rbx_out, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + " xor %%eax,%%eax;\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : "=a"(ret), "=b"(rbx_out) \ + : "a"(rax), inputs \ + : "memory"); \ + ret; \ + }) + +#define __encls_2(rax, rbx, rcx) \ + ({ \ + unsigned long ign_rbx_out; \ + __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_1_1(rax, data, rcx) \ + ({ \ + unsigned long rbx_out; \ + int ret = __encls_N(rax, rbx_out, "c"(rcx)); \ + if (!ret) \ + data = rbx_out; \ + ret; \ + }) + +static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) +{ + return __encls_2(ECREATE, pginfo, secs); +} + +static inline int __eextend(void *secs, void *addr) +{ + return __encls_2(EEXTEND, secs, addr); +} + +static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EADD, pginfo, addr); +} + +static inline int __einit(void *sigstruct, void *token, void *secs) +{ + return __encls_ret_3(EINIT, sigstruct, secs, token); +} + +static inline int __eremove(void *addr) +{ + return __encls_ret_1(EREMOVE, addr); +} + +static inline int __edbgwr(void *addr, unsigned long *data) +{ + return __encls_2(EDGBWR, *data, addr); +} + +static inline int __edbgrd(void *addr, unsigned long *data) +{ + return __encls_1_1(EDGBRD, *data, addr); +} + +static inline int __etrack(void *addr) +{ + return __encls_ret_1(ETRACK, addr); +} + +static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(ELDU, pginfo, addr, va); +} + +static inline int __eblock(void *addr) +{ + return __encls_ret_1(EBLOCK, addr); +} + +static inline int __epa(void *addr) +{ + unsigned long rbx = SGX_PAGE_TYPE_VA; + + return __encls_2(EPA, rbx, addr); +} + +static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(EWB, pginfo, addr, va); +} + +#endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c new file mode 100644 index 000000000000..90a5caf76939 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <asm/mman.h> +#include <linux/mman.h> +#include <linux/delay.h> +#include <linux/file.h> +#include <linux/hashtable.h> +#include <linux/highmem.h> +#include <linux/ratelimit.h> +#include <linux/sched/signal.h> +#include <linux/shmem_fs.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +{ + struct sgx_va_page *va_page = NULL; + void *err; + + BUILD_BUG_ON(SGX_VA_SLOT_COUNT != + (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1); + + if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) { + va_page = kzalloc(sizeof(*va_page), GFP_KERNEL); + if (!va_page) + return ERR_PTR(-ENOMEM); + + va_page->epc_page = sgx_alloc_va_page(); + if (IS_ERR(va_page->epc_page)) { + err = ERR_CAST(va_page->epc_page); + kfree(va_page); + return err; + } + + WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT); + } + encl->page_cnt++; + return va_page; +} + +static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +{ + encl->page_cnt--; + + if (va_page) { + sgx_free_epc_page(va_page->epc_page); + list_del(&va_page->list); + kfree(va_page); + } +} + +static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) +{ + struct sgx_epc_page *secs_epc; + struct sgx_va_page *va_page; + struct sgx_pageinfo pginfo; + struct sgx_secinfo secinfo; + unsigned long encl_size; + struct file *backing; + long ret; + + va_page = sgx_encl_grow(encl); + if (IS_ERR(va_page)) + return PTR_ERR(va_page); + else if (va_page) + list_add(&va_page->list, &encl->va_pages); + /* else the tail page of the VA page list had free slots. */ + + /* The extra page goes to SECS. */ + encl_size = secs->size + PAGE_SIZE; + + backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5), + VM_NORESERVE); + if (IS_ERR(backing)) { + ret = PTR_ERR(backing); + goto err_out_shrink; + } + + encl->backing = backing; + + secs_epc = sgx_alloc_epc_page(&encl->secs, true); + if (IS_ERR(secs_epc)) { + ret = PTR_ERR(secs_epc); + goto err_out_backing; + } + + encl->secs.epc_page = secs_epc; + + pginfo.addr = 0; + pginfo.contents = (unsigned long)secs; + pginfo.metadata = (unsigned long)&secinfo; + pginfo.secs = 0; + memset(&secinfo, 0, sizeof(secinfo)); + + ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc)); + if (ret) { + ret = -EIO; + goto err_out; + } + + if (secs->attributes & SGX_ATTR_DEBUG) + set_bit(SGX_ENCL_DEBUG, &encl->flags); + + encl->secs.encl = encl; + encl->base = secs->base; + encl->size = secs->size; + encl->attributes = secs->attributes; + encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + + /* Set only after completion, as encl->lock has not been taken. */ + set_bit(SGX_ENCL_CREATED, &encl->flags); + + return 0; + +err_out: + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + +err_out_backing: + fput(encl->backing); + encl->backing = NULL; + +err_out_shrink: + sgx_encl_shrink(encl, va_page); + + return ret; +} + +/** + * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE + * @encl: An enclave pointer. + * @arg: The ioctl argument. + * + * Allocate kernel data structures for the enclave and invoke ECREATE. + * + * Return: + * - 0: Success. + * - -EIO: ECREATE failed. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_create create_arg; + void *secs; + int ret; + + if (test_bit(SGX_ENCL_CREATED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&create_arg, arg, sizeof(create_arg))) + return -EFAULT; + + secs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!secs) + return -ENOMEM; + + if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE)) + ret = -EFAULT; + else + ret = sgx_encl_create(encl, secs); + + kfree(secs); + return ret; +} + +static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) +{ + u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; + u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK; + + if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS) + return -EINVAL; + + if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R)) + return -EINVAL; + + /* + * CPU will silently overwrite the permissions as zero, which means + * that we need to validate it ourselves. + */ + if (pt == SGX_SECINFO_TCS && perm) + return -EINVAL; + + if (secinfo->flags & SGX_SECINFO_RESERVED_MASK) + return -EINVAL; + + if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved))) + return -EINVAL; + + return 0; +} + +static int __sgx_encl_add_page(struct sgx_encl *encl, + struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_secinfo *secinfo, unsigned long src) +{ + struct sgx_pageinfo pginfo; + struct vm_area_struct *vma; + struct page *src_page; + int ret; + + /* Deny noexec. */ + vma = find_vma(current->mm, src); + if (!vma) + return -EFAULT; + + if (!(vma->vm_flags & VM_MAYEXEC)) + return -EACCES; + + ret = get_user_pages(src, 1, 0, &src_page, NULL); + if (ret < 1) + return -EFAULT; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = (unsigned long)secinfo; + pginfo.contents = (unsigned long)kmap_atomic(src_page); + + ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); + + kunmap_atomic((void *)pginfo.contents); + put_page(src_page); + + return ret ? -EIO : 0; +} + +/* + * If the caller requires measurement of the page as a proof for the content, + * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this + * operation until the entire page is measured." + */ +static int __sgx_encl_extend(struct sgx_encl *encl, + struct sgx_epc_page *epc_page) +{ + unsigned long offset; + int ret; + + for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) { + ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page), + sgx_get_epc_virt_addr(epc_page) + offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EEXTEND"); + + return -EIO; + } + } + + return 0; +} + +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + unsigned long offset, struct sgx_secinfo *secinfo, + unsigned long flags) +{ + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + int ret; + + encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags); + if (IS_ERR(encl_page)) + return PTR_ERR(encl_page); + + epc_page = sgx_alloc_epc_page(encl_page, true); + if (IS_ERR(epc_page)) { + kfree(encl_page); + return PTR_ERR(epc_page); + } + + va_page = sgx_encl_grow(encl); + if (IS_ERR(va_page)) { + ret = PTR_ERR(va_page); + goto err_out_free; + } + + mmap_read_lock(current->mm); + mutex_lock(&encl->lock); + + /* + * Adding to encl->va_pages must be done under encl->lock. Ditto for + * deleting (via sgx_encl_shrink()) in the error path. + */ + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + /* + * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e. + * can't be gracefully unwound, while failure on EADD/EXTEND is limited + * to userspace errors (or kernel/hardware bugs). + */ + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + if (ret) + goto err_out_unlock; + + ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo, + src); + if (ret) + goto err_out; + + /* + * Complete the "add" before doing the "extend" so that the "add" + * isn't in a half-baked state in the extremely unlikely scenario + * the enclave will be destroyed in response to EEXTEND failure. + */ + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl->secs_child_cnt++; + + if (flags & SGX_PAGE_MEASURE) { + ret = __sgx_encl_extend(encl, epc_page); + if (ret) + goto err_out; + } + + sgx_mark_page_reclaimable(encl_page->epc_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + return ret; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_unlock: + sgx_encl_shrink(encl, va_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + +err_out_free: + sgx_free_epc_page(epc_page); + kfree(encl_page); + + return ret; +} + +/** + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES + * @encl: an enclave pointer + * @arg: a user pointer to a struct sgx_enclave_add_pages instance + * + * Add one or more pages to an uninitialized enclave, and optionally extend the + * measurement with the contents of the page. The SECINFO and measurement mask + * are applied to all pages. + * + * A SECINFO for a TCS is required to always contain zero permissions because + * CPU silently zeros them. Allowing anything else would cause a mismatch in + * the measurement. + * + * mmap()'s protection bits are capped by the page permissions. For each page + * address, the maximum protection bits are computed with the following + * heuristics: + * + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions. + * 2. A TCS page: PROT_R | PROT_W. + * + * mmap() is not allowed to surpass the minimum of the maximum protection bits + * within the given address range. + * + * The function deinitializes kernel data structures for enclave and returns + * -EIO in any of the following conditions: + * + * - Enclave Page Cache (EPC), the physical memory holding enclaves, has + * been invalidated. This will cause EADD and EEXTEND to fail. + * - If the source address is corrupted somehow when executing EADD. + * + * Return: + * - 0: Success. + * - -EACCES: The source page is located in a noexec partition. + * - -ENOMEM: Out of EPC pages. + * - -EINTR: The call was interrupted before data was processed. + * - -EIO: Either EADD or EEXTEND failed because invalid source address + * or power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_add_pages add_arg; + struct sgx_secinfo secinfo; + unsigned long c; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&add_arg, arg, sizeof(add_arg))) + return -EFAULT; + + if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || + !IS_ALIGNED(add_arg.src, PAGE_SIZE)) + return -EINVAL; + + if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) + return -EINVAL; + + if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) + return -EINVAL; + + if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, + sizeof(secinfo))) + return -EFAULT; + + if (sgx_validate_secinfo(&secinfo)) + return -EINVAL; + + for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) { + if (signal_pending(current)) { + if (!c) + ret = -ERESTARTSYS; + + break; + } + + if (need_resched()) + cond_resched(); + + ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c, + &secinfo, add_arg.flags); + if (ret) + break; + } + + add_arg.count = c; + + if (copy_to_user(arg, &add_arg, sizeof(add_arg))) + return -EFAULT; + + return ret; +} + +static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, + void *hash) +{ + SHASH_DESC_ON_STACK(shash, tfm); + + shash->tfm = tfm; + + return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); +} + +static int sgx_get_key_hash(const void *modulus, void *hash) +{ + struct crypto_shash *tfm; + int ret; + + tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + ret = __sgx_get_key_hash(tfm, modulus, hash); + + crypto_free_shash(tfm); + return ret; +} + +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, + void *token) +{ + u64 mrsigner[4]; + int i, j, k; + void *addr; + int ret; + + /* + * Deny initializing enclaves with attributes (namely provisioning) + * that have not been explicitly allowed. + */ + if (encl->attributes & ~encl->attributes_mask) + return -EACCES; + + /* + * Attributes should not be enforced *only* against what's available on + * platform (done in sgx_encl_create) but checked and enforced against + * the mask for enforcement in sigstruct. For example an enclave could + * opt to sign with AVX bit in xfrm, but still be loadable on a platform + * without it if the sigstruct->body.attributes_mask does not turn that + * bit on. + */ + if (sigstruct->body.attributes & sigstruct->body.attributes_mask & + sgx_attributes_reserved_mask) + return -EINVAL; + + if (sigstruct->body.miscselect & sigstruct->body.misc_mask & + sgx_misc_reserved_mask) + return -EINVAL; + + if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask & + sgx_xfrm_reserved_mask) + return -EINVAL; + + ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); + if (ret) + return ret; + + mutex_lock(&encl->lock); + + /* + * ENCLS[EINIT] is interruptible because it has such a high latency, + * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending, + * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be + * serviced. + */ + for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) { + for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) { + addr = sgx_get_epc_virt_addr(encl->secs.epc_page); + + preempt_disable(); + + for (k = 0; k < 4; k++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + + ret = __einit(sigstruct, token, addr); + + preempt_enable(); + + if (ret == SGX_UNMASKED_EVENT) + continue; + else + break; + } + + if (ret != SGX_UNMASKED_EVENT) + break; + + msleep_interruptible(SGX_EINIT_SLEEP_TIME); + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto err_out; + } + } + + if (ret & ENCLS_FAULT_FLAG) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EINIT"); + + ret = -EIO; + } else if (ret) { + pr_debug("EINIT returned %d\n", ret); + ret = -EPERM; + } else { + set_bit(SGX_ENCL_INITIALIZED, &encl->flags); + } + +err_out: + mutex_unlock(&encl->lock); + return ret; +} + +/** + * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_init instance + * + * Flush any outstanding enqueued EADD operations and perform EINIT. The + * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match + * the enclave's MRSIGNER, which is caculated from the provided sigstruct. + * + * Return: + * - 0: Success. + * - -EPERM: Invalid SIGSTRUCT. + * - -EIO: EINIT failed because of a power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_sigstruct *sigstruct; + struct sgx_enclave_init init_arg; + struct page *initp_page; + void *token; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&init_arg, arg, sizeof(init_arg))) + return -EFAULT; + + initp_page = alloc_page(GFP_KERNEL); + if (!initp_page) + return -ENOMEM; + + sigstruct = kmap(initp_page); + token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); + memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); + + if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct, + sizeof(*sigstruct))) { + ret = -EFAULT; + goto out; + } + + /* + * A legacy field used with Intel signed enclaves. These used to mean + * regular and architectural enclaves. The CPU only accepts these values + * but they do not have any other meaning. + * + * Thus, reject any other values. + */ + if (sigstruct->header.vendor != 0x0000 && + sigstruct->header.vendor != 0x8086) { + ret = -EINVAL; + goto out; + } + + ret = sgx_encl_init(encl, sigstruct, token); + +out: + kunmap(initp_page); + __free_page(initp_page); + return ret; +} + +/** + * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_provision instance + * + * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to + * /dev/sgx_provision. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_provision params; + struct file *file; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + file = fget(params.fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct sgx_encl *encl = filep->private_data; + int ret; + + if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags)) + return -EBUSY; + + switch (cmd) { + case SGX_IOC_ENCLAVE_CREATE: + ret = sgx_ioc_enclave_create(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_ADD_PAGES: + ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_INIT: + ret = sgx_ioc_enclave_init(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_PROVISION: + ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + clear_bit(SGX_ENCL_IOCTL, &encl->flags); + return ret; +} diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c new file mode 100644 index 000000000000..8df81a3ed945 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/freezer.h> +#include <linux/highmem.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/ratelimit.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; +static int sgx_nr_epc_sections; +static struct task_struct *ksgxd_tsk; +static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); + +/* + * These variables are part of the state of the reclaimer, and must be accessed + * with sgx_reclaimer_lock acquired. + */ +static LIST_HEAD(sgx_active_page_list); + +static DEFINE_SPINLOCK(sgx_reclaimer_lock); + +/* + * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS + * pages whose child pages blocked EREMOVE. + */ +static void sgx_sanitize_section(struct sgx_epc_section *section) +{ + struct sgx_epc_page *page; + LIST_HEAD(dirty); + int ret; + + /* init_laundry_list is thread-local, no need for a lock: */ + while (!list_empty(§ion->init_laundry_list)) { + if (kthread_should_stop()) + return; + + /* needed for access to ->page_list: */ + spin_lock(§ion->lock); + + page = list_first_entry(§ion->init_laundry_list, + struct sgx_epc_page, list); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (!ret) + list_move(&page->list, §ion->page_list); + else + list_move_tail(&page->list, &dirty); + + spin_unlock(§ion->lock); + + cond_resched(); + } + + list_splice(&dirty, §ion->init_laundry_list); +} + +static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + struct sgx_encl *encl = page->encl; + struct sgx_encl_mm *encl_mm; + bool ret = true; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page); + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + + if (!ret) + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + if (!ret) + return false; + + return true; +} + +static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); + + mutex_lock(&encl->lock); + + ret = __eblock(sgx_get_epc_virt_addr(epc_page)); + if (encls_failed(ret)) + ENCLS_WARN(ret, "EBLOCK"); + + mutex_unlock(&encl->lock); +} + +static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, + struct sgx_backing *backing) +{ + struct sgx_pageinfo pginfo; + int ret; + + pginfo.addr = 0; + pginfo.secs = 0; + + pginfo.contents = (unsigned long)kmap_atomic(backing->contents); + pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) + + backing->pcmd_offset; + + ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + + kunmap_atomic((void *)(unsigned long)(pginfo.metadata - + backing->pcmd_offset)); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + return ret; +} + +static void sgx_ipi_cb(void *info) +{ +} + +static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * Can race with sgx_encl_mm_add(), but ETRACK has already been + * executed, which means that the CPUs running in the new mm will enter + * into the enclave with a fresh epoch. + */ + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + +/* + * Swap page to the regular memory transformed to the blocked state by using + * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * + * The first trial just tries to write the page assuming that some other thread + * has reset the count for threads inside the enlave by using ETRACK, and + * previous thread count has been zeroed out. The second trial calls ETRACK + * before EWB. If that fails we kick all the HW threads out, and then do EWB, + * which should be guaranteed the succeed. + */ +static void sgx_encl_ewb(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_va_page *va_page; + unsigned int va_offset; + void *va_slot; + int ret; + + encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; + + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + va_offset = sgx_alloc_va_slot(va_page); + va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset; + if (sgx_va_page_full(va_page)) + list_move_tail(&va_page->list, &encl->va_pages); + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page)); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ETRACK"); + } + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + /* + * Slow path, send IPIs to kick cpus out of the + * enclave. Note, it's imperative that the cpu + * mask is generated *after* ETRACK, else we'll + * miss cpus that entered the enclave between + * generating the mask and incrementing epoch. + */ + on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + sgx_ipi_cb, NULL, 1); + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + } + } + + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EWB"); + + sgx_free_va_slot(va_page, va_offset); + } else { + encl_page->desc |= va_offset; + encl_page->va_page = va_page; + } +} + +static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_backing secs_backing; + int ret; + + mutex_lock(&encl->lock); + + sgx_encl_ewb(epc_page, backing); + encl_page->epc_page = NULL; + encl->secs_child_cnt--; + + if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { + ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), + &secs_backing); + if (ret) + goto out; + + sgx_encl_ewb(encl->secs.epc_page, &secs_backing); + + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + + sgx_encl_put_backing(&secs_backing, true); + } + +out: + mutex_unlock(&encl->lock); +} + +/* + * Take a fixed number of pages from the head of the active page pool and + * reclaim them to the enclave's private shmem files. Skip the pages, which have + * been accessed since the last scan. Move those pages to the tail of active + * page pool so that the pages get scanned in LRU like fashion. + * + * Batch process a chunk of pages (at the moment 16) in order to degrade amount + * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit + * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI + * + EWB) but not sufficiently. Reclaiming one page at a time would also be + * problematic as it would increase the lock contention too much, which would + * halt forward progress. + */ +static void sgx_reclaim_pages(void) +{ + struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; + struct sgx_backing backing[SGX_NR_TO_SCAN]; + struct sgx_epc_section *section; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + pgoff_t page_index; + int cnt = 0; + int ret; + int i; + + spin_lock(&sgx_reclaimer_lock); + for (i = 0; i < SGX_NR_TO_SCAN; i++) { + if (list_empty(&sgx_active_page_list)) + break; + + epc_page = list_first_entry(&sgx_active_page_list, + struct sgx_epc_page, list); + list_del_init(&epc_page->list); + encl_page = epc_page->owner; + + if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) + chunk[cnt++] = epc_page; + else + /* The owner is freeing the page. No need to add the + * page back to the list of reclaimable pages. + */ + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + encl_page = epc_page->owner; + + if (!sgx_reclaimer_age(epc_page)) + goto skip; + + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); + if (ret) + goto skip; + + mutex_lock(&encl_page->encl->lock); + encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; + mutex_unlock(&encl_page->encl->lock); + continue; + +skip: + spin_lock(&sgx_reclaimer_lock); + list_add_tail(&epc_page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + + chunk[i] = NULL; + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (epc_page) + sgx_reclaimer_block(epc_page); + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (!epc_page) + continue; + + encl_page = epc_page->owner; + sgx_reclaimer_write(epc_page, &backing[i]); + sgx_encl_put_backing(&backing[i], true); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + + section = &sgx_epc_sections[epc_page->section]; + spin_lock(§ion->lock); + list_add_tail(&epc_page->list, §ion->page_list); + section->free_cnt++; + spin_unlock(§ion->lock); + } +} + +static unsigned long sgx_nr_free_pages(void) +{ + unsigned long cnt = 0; + int i; + + for (i = 0; i < sgx_nr_epc_sections; i++) + cnt += sgx_epc_sections[i].free_cnt; + + return cnt; +} + +static bool sgx_should_reclaim(unsigned long watermark) +{ + return sgx_nr_free_pages() < watermark && + !list_empty(&sgx_active_page_list); +} + +static int ksgxd(void *p) +{ + int i; + + set_freezable(); + + /* + * Sanitize pages in order to recover from kexec(). The 2nd pass is + * required for SECS pages, whose child pages blocked EREMOVE. + */ + for (i = 0; i < sgx_nr_epc_sections; i++) + sgx_sanitize_section(&sgx_epc_sections[i]); + + for (i = 0; i < sgx_nr_epc_sections; i++) { + sgx_sanitize_section(&sgx_epc_sections[i]); + + /* Should never happen. */ + if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) + WARN(1, "EPC section %d has unsanitized pages.\n", i); + } + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + wait_event_freezable(ksgxd_waitq, + kthread_should_stop() || + sgx_should_reclaim(SGX_NR_HIGH_PAGES)); + + if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) + sgx_reclaim_pages(); + + cond_resched(); + } + + return 0; +} + +static bool __init sgx_page_reclaimer_init(void) +{ + struct task_struct *tsk; + + tsk = kthread_run(ksgxd, NULL, "ksgxd"); + if (IS_ERR(tsk)) + return false; + + ksgxd_tsk = tsk; + + return true; +} + +static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +{ + struct sgx_epc_page *page; + + spin_lock(§ion->lock); + + if (list_empty(§ion->page_list)) { + spin_unlock(§ion->lock); + return NULL; + } + + page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + list_del_init(&page->list); + section->free_cnt--; + + spin_unlock(§ion->lock); + return page; +} + +/** + * __sgx_alloc_epc_page() - Allocate an EPC page + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). + * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *__sgx_alloc_epc_page(void) +{ + struct sgx_epc_section *section; + struct sgx_epc_page *page; + int i; + + for (i = 0; i < sgx_nr_epc_sections; i++) { + section = &sgx_epc_sections[i]; + + page = __sgx_alloc_epc_page_from_section(section); + if (page) + return page; + } + + return ERR_PTR(-ENOMEM); +} + +/** + * sgx_mark_page_reclaimable() - Mark a page as reclaimable + * @page: EPC page + * + * Mark a page as reclaimable and add it to the active page list. Pages + * are automatically removed from the active list when freed. + */ +void sgx_mark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED; + list_add_tail(&page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); +} + +/** + * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list + * @page: EPC page + * + * Clear the reclaimable flag and remove the page from the active page list. + * + * Return: + * 0 on success, + * -EBUSY if the page is in the process of being reclaimed + */ +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) { + /* The page is being reclaimed. */ + if (list_empty(&page->list)) { + spin_unlock(&sgx_reclaimer_lock); + return -EBUSY; + } + + list_del(&page->list); + page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + return 0; +} + +/** + * sgx_alloc_epc_page() - Allocate an EPC page + * @owner: the owner of the EPC page + * @reclaim: reclaim pages if necessary + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). If + * @reclaim is set to true, directly reclaim pages when we are out of pages. No + * mm's can be locked when @reclaim is set to true. + * + * Finally, wake up ksgxd when the number of pages goes below the watermark + * before returning back to the caller. + * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) +{ + struct sgx_epc_page *page; + + for ( ; ; ) { + page = __sgx_alloc_epc_page(); + if (!IS_ERR(page)) { + page->owner = owner; + break; + } + + if (list_empty(&sgx_active_page_list)) + return ERR_PTR(-ENOMEM); + + if (!reclaim) { + page = ERR_PTR(-EBUSY); + break; + } + + if (signal_pending(current)) { + page = ERR_PTR(-ERESTARTSYS); + break; + } + + sgx_reclaim_pages(); + cond_resched(); + } + + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + wake_up(&ksgxd_waitq); + + return page; +} + +/** + * sgx_free_epc_page() - Free an EPC page + * @page: an EPC page + * + * Call EREMOVE for an EPC page and insert it back to the list of free pages. + */ +void sgx_free_epc_page(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) + return; + + spin_lock(§ion->lock); + list_add_tail(&page->list, §ion->page_list); + section->free_cnt++; + spin_unlock(§ion->lock); +} + +static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, + unsigned long index, + struct sgx_epc_section *section) +{ + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned long i; + + section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB); + if (!section->virt_addr) + return false; + + section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + if (!section->pages) { + memunmap(section->virt_addr); + return false; + } + + section->phys_addr = phys_addr; + spin_lock_init(§ion->lock); + INIT_LIST_HEAD(§ion->page_list); + INIT_LIST_HEAD(§ion->init_laundry_list); + + for (i = 0; i < nr_pages; i++) { + section->pages[i].section = index; + section->pages[i].flags = 0; + section->pages[i].owner = NULL; + list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + } + + section->free_cnt = nr_pages; + return true; +} + +/** + * A section metric is concatenated in a way that @low bits 12-31 define the + * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the + * metric. + */ +static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) +{ + return (low & GENMASK_ULL(31, 12)) + + ((high & GENMASK_ULL(19, 0)) << 32); +} + +static bool __init sgx_page_cache_init(void) +{ + u32 eax, ebx, ecx, edx, type; + u64 pa, size; + int i; + + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { + cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) + break; + + if (type != SGX_CPUID_EPC_SECTION) { + pr_err_once("Unknown EPC section type: %u\n", type); + break; + } + + pa = sgx_calc_section_metric(eax, ebx); + size = sgx_calc_section_metric(ecx, edx); + + pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1); + + if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) { + pr_err("No free memory for an EPC section\n"); + break; + } + + sgx_nr_epc_sections++; + } + + if (!sgx_nr_epc_sections) { + pr_err("There are zero EPC sections.\n"); + return false; + } + + return true; +} + +static int __init sgx_init(void) +{ + int ret; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_SGX)) + return -ENODEV; + + if (!sgx_page_cache_init()) + return -ENOMEM; + + if (!sgx_page_reclaimer_init()) { + ret = -ENOMEM; + goto err_page_cache; + } + + ret = sgx_drv_init(); + if (ret) + goto err_kthread; + + return 0; + +err_kthread: + kthread_stop(ksgxd_tsk); + +err_page_cache: + for (i = 0; i < sgx_nr_epc_sections; i++) { + vfree(sgx_epc_sections[i].pages); + memunmap(sgx_epc_sections[i].virt_addr); + } + + return ret; +} + +device_initcall(sgx_init); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h new file mode 100644 index 000000000000..5fa42d143feb --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_SGX_H +#define _X86_SGX_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include "arch.h" + +#undef pr_fmt +#define pr_fmt(fmt) "sgx: " fmt + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_EEXTEND_BLOCK_SIZE 256 +#define SGX_NR_TO_SCAN 16 +#define SGX_NR_LOW_PAGES 32 +#define SGX_NR_HIGH_PAGES 64 + +/* Pages, which are being tracked by the page reclaimer. */ +#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) + +struct sgx_epc_page { + unsigned int section; + unsigned int flags; + struct sgx_encl_page *owner; + struct list_head list; +}; + +/* + * The firmware can define multiple chunks of EPC to the different areas of the + * physical memory e.g. for memory areas of the each node. This structure is + * used to store EPC pages for one EPC section and virtual memory area where + * the pages have been mapped. + * + * 'lock' must be held before accessing 'page_list' or 'free_cnt'. + */ +struct sgx_epc_section { + unsigned long phys_addr; + void *virt_addr; + struct sgx_epc_page *pages; + + spinlock_t lock; + struct list_head page_list; + unsigned long free_cnt; + + /* + * Pages which need EREMOVE run on them before they can be + * used. Only safe to be accessed in ksgxd and init code. + * Not protected by locks. + */ + struct list_head init_laundry_list; +}; + +extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; + +static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->phys_addr + index * PAGE_SIZE; +} + +static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->virt_addr + index * PAGE_SIZE; +} + +struct sgx_epc_page *__sgx_alloc_epc_page(void); +void sgx_free_epc_page(struct sgx_epc_page *page); + +void sgx_mark_page_reclaimable(struct sgx_epc_page *page); +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); + +#endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index d3a0791bc052..8678864ce712 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -25,10 +25,10 @@ #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -#ifdef CONFIG_SMP unsigned int __max_die_per_package __read_mostly = 1; EXPORT_SYMBOL(__max_die_per_package); +#ifdef CONFIG_SMP /* * Check if given CPUID extended toplogy "leaf" is implemented */ @@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + bool die_level_present = false; int leaf; leaf = detect_extended_topology_leaf(c); @@ -126,6 +127,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { + die_level_present = true; die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } @@ -139,8 +141,12 @@ int detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) & core_select_mask; - c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, - core_plus_mask_width) & die_select_mask; + + if (die_level_present) { + c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + core_plus_mask_width) & die_select_mask; + } + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, die_plus_mask_width); /* diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 924571fe5864..c6ede3b3d302 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -501,12 +501,12 @@ static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) ghcb_rbp_is_valid(ghcb))) return false; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - regs->si = ghcb->save.rsi; - regs->di = ghcb->save.rdi; - regs->bp = ghcb->save.rbp; + regs->bx = ghcb_get_rbx(ghcb); + regs->cx = ghcb_get_rcx(ghcb); + regs->dx = ghcb_get_rdx(ghcb); + regs->si = ghcb_get_rsi(ghcb); + regs->di = ghcb_get_rdi(ghcb); + regs->bp = ghcb_get_rbp(ghcb); return true; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3492aa36bf09..6f7b8cc1bc9f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, init_completion(&cmd.done); for (; count; count -= 16) { - call_single_data_t csd = { - .func = cpuid_smp_cpuid, - .info = &cmd, - }; + call_single_data_t csd; + + INIT_CSD(&csd, cpuid_smp_cpuid, &cmd); cmd.regs.eax = pos; cmd.regs.ecx = pos >> 32; diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 33ee47670b99..5fcac46aaf6b 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,8 +13,6 @@ #include <linux/uaccess.h> -static void *kdump_buf_page; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE @@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) * @userbuf: if set, @buf is in user address space, use copy_to_user(), * otherwise @buf is in kernel address space, use memcpy(). * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. + * Copy a page from "oldmem". For this page, there might be no pte mapped + * in the current kernel. */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) { void *vaddr; @@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn); + vaddr = kmap_local_pfn(pfn); if (!userbuf) { - memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr); + memcpy(buf, vaddr + offset, csize); } else { - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Kdump buffer page not" - " allocated\n"); - kunmap_atomic(vaddr); - return -EFAULT; - } - copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr); - if (copy_to_user(buf, (kdump_buf_page + offset), csize)) - return -EFAULT; + if (copy_to_user(buf, vaddr + offset, csize)) + csize = -EFAULT; } - return csize; -} + kunmap_local(vaddr); -static int __init kdump_buf_page_init(void) -{ - int ret = 0; - - kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" - " page\n"); - ret = -ENOMEM; - } - - return ret; + return csize; } -arch_initcall(kdump_buf_page_init); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index ddffd80f5c52..6a4cb71c2498 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -184,31 +184,31 @@ static unsigned int ioapic_id; struct of_ioapic_type { u32 out_type; - u32 trigger; - u32 polarity; + u32 is_level; + u32 active_low; }; static struct of_ioapic_type of_ioapic_type[] = { { - .out_type = IRQ_TYPE_EDGE_RISING, - .trigger = IOAPIC_EDGE, - .polarity = 1, + .out_type = IRQ_TYPE_EDGE_FALLING, + .is_level = 0, + .active_low = 1, }, { - .out_type = IRQ_TYPE_LEVEL_LOW, - .trigger = IOAPIC_LEVEL, - .polarity = 0, + .out_type = IRQ_TYPE_LEVEL_HIGH, + .is_level = 1, + .active_low = 0, }, { - .out_type = IRQ_TYPE_LEVEL_HIGH, - .trigger = IOAPIC_LEVEL, - .polarity = 1, + .out_type = IRQ_TYPE_LEVEL_LOW, + .is_level = 1, + .active_low = 1, }, { - .out_type = IRQ_TYPE_EDGE_FALLING, - .trigger = IOAPIC_EDGE, - .polarity = 0, + .out_type = IRQ_TYPE_EDGE_RISING, + .is_level = 0, + .active_low = 0, }, }; @@ -228,7 +228,7 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; it = &of_ioapic_type[type_index]; - ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low); tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); tmp.ioapic.pin = fwspec->param[0]; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 97aa900386cb..299c20f0a38b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -183,7 +183,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } } -void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl) { struct unwind_state state; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 1dd851397bd9..5601b95944fa 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -128,12 +128,21 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); - unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin; /* - * This is a software stack, so 'end' can be a valid stack pointer. - * It just means the stack is empty. + * @end points directly to the top most stack entry to avoid a -8 + * adjustment in the stack switch hotpath. Adjust it back before + * calculating @begin. + */ + end++; + begin = end - (IRQ_STACK_SIZE / sizeof(long)); + + /* + * Due to the switching logic RSP can never be == @end because the + * final operation is 'popq %rsp' which means after that RSP points + * to the original stack and not to @end. */ if (stack < begin || stack >= end) return false; @@ -143,8 +152,9 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info info->end = end; /* - * The next stack pointer is the first thing pushed by the entry code - * after switching to the irq stack. + * The next stack pointer is stored at the top of the irq stack + * before switching to the irq stack. Actual stack entries are all + * below that. */ info->next_sp = (unsigned long *)*(end - 1); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index eb86a2b831b1..571220ac8bea 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -121,7 +121,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu) } EXPORT_SYMBOL(copy_fpregs_to_fpstate); -void kernel_fpu_begin(void) +void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); @@ -141,13 +141,14 @@ void kernel_fpu_begin(void) } __cpu_invalidate_fpregs_state(); - if (boot_cpu_has(X86_FEATURE_XMM)) + /* Put sane initial values into the control registers. */ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); - if (boot_cpu_has(X86_FEATURE_FPU)) + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } -EXPORT_SYMBOL_GPL(kernel_fpu_begin); +EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5d8047441a0a..683749b80ae2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu) fx->fop = 0; fx->rip = 0; fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); + memset(fx->st_space, 0, sizeof(fx->st_space)); } /* * SSE is in init state */ if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(&fx->xmm_space[0], 0, 256); + memset(fx->xmm_space, 0, sizeof(fx->xmm_space)); /* * First two features are FPU and SSE, which above we handled diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index ac3d5f22fe64..7c273846c687 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -140,16 +140,27 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + restore_mcount_regs /* @@ -173,6 +184,7 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) * It is also used to copy the retq for trampolines. */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) + UNWIND_HINT_FUNC retq SYM_FUNC_END(ftrace_epilogue) @@ -265,7 +277,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) restore_mcount_regs 8 /* Restore flags */ popfq - UNWIND_HINT_RET_OFFSET + UNWIND_HINT_FUNC jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) @@ -322,8 +334,7 @@ SYM_FUNC_START(ftrace_graph_caller) retq SYM_FUNC_END(ftrace_graph_caller) -SYM_CODE_START(return_to_handler) - UNWIND_HINT_EMPTY +SYM_FUNC_START(return_to_handler) subq $24, %rsp /* Save the return values */ @@ -338,5 +349,5 @@ SYM_CODE_START(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC rdi -SYM_CODE_END(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 05e117137b45..5e9beb77cafd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -37,7 +37,6 @@ #include <asm/kasan.h> #include <asm/fixmap.h> #include <asm/realmode.h> -#include <asm/desc.h> #include <asm/extable.h> #include <asm/trapnr.h> #include <asm/sev-es.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3c417734790f..04bddaaba8e2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -26,15 +26,6 @@ #include <asm/nospec-branch.h> #include <asm/fixmap.h> -#ifdef CONFIG_PARAVIRT_XXL -#include <asm/asm-offsets.h> -#include <asm/paravirt.h> -#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg -#else -#define INTERRUPT_RETURN iretq -#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg -#endif - /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. @@ -540,21 +531,19 @@ SYM_DATA_END(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* - * 512 MB kernel mapping. We spend a full page on this pagetable - * anyway. + * Kernel high mapping. * - * The kernel code+data+bss must not be bigger than that. + * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in + * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled, + * 512 MiB otherwise. * - * (NOTE: at +512MB starts the module area, see MODULES_VADDR. - * If you want to increase this then increase MODULES_VADDR - * too.) + * (NOTE: after that starts the module area, see MODULES_VADDR.) * - * This table is eventually used by the kernel during normal - * runtime. Care must be taken to clear out undesired bits - * later, like _PAGE_RW or _PAGE_GLOBAL in some cases. + * This table is eventually used by the kernel during normal runtime. + * Care must be taken to clear out undesired bits later, like _PAGE_RW + * or _PAGE_GLOBAL in some cases. */ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, - KERNEL_IMAGE_SIZE/PMD_SIZE) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7a50f0b62a70..08651a4e6aa0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include <linux/cpu.h> #include <linux/irq.h> +#include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> @@ -50,7 +51,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_PCI_MSI +#ifdef CONFIG_GENERIC_MSI_IRQ static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -467,9 +468,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_PCI_MSI - -void hpet_msi_unmask(struct irq_data *data) +#ifdef CONFIG_GENERIC_MSI_IRQ +static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -479,7 +479,7 @@ void hpet_msi_unmask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_mask(struct irq_data *data) +static void hpet_msi_mask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -489,12 +489,122 @@ void hpet_msi_mask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct fwnode_handle *fn; + struct irq_fwspec fwspec; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = hpet_id; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + return NULL; + } + if (parent != x86_vector_domain) + hpet_msi_controller.name = "IR-HPET-MSI"; + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} + static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { struct hpet_channel *hc = clockevent_to_channel(evt); diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 03aa33b58165..668a4a6533d9 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) CPU_ENTRY_AREA_TOTAL_SIZE)) return true; + /* + * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU + * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. + */ +#ifdef CONFIG_SMP + if (within_area(addr, end, (unsigned long)__per_cpu_offset, + sizeof(unsigned long) * nr_cpu_ids)) + return true; +#else + if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, + sizeof(pcpu_unit_offsets))) + return true; +#endif + for_each_possible_cpu(cpu) { /* The original rw GDT is being used after load_direct_gdt() */ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), @@ -293,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) (unsigned long)&per_cpu(cpu_tlbstate, cpu), sizeof(struct tlb_state))) return true; + + /* + * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() + * will read per-cpu cpu_dr7 before clear dr7 register. + */ + if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), + sizeof(cpu_dr7))) + return true; } return false; @@ -491,15 +513,12 @@ static int hw_breakpoint_handler(struct die_args *args) struct perf_event *bp; unsigned long *dr6_p; unsigned long dr6; + bool bpx; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; - /* If it's a single step, TRAP bits are random */ - if (dr6 & DR_STEP) - return NOTIFY_DONE; - /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; @@ -509,28 +528,29 @@ static int hw_breakpoint_handler(struct die_args *args) if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; + bp = this_cpu_read(bp_per_reg[i]); + if (!bp) + continue; + + bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; + /* - * The counter may be concurrently released but that can only - * occur from a call_rcu() path. We can then safely fetch - * the breakpoint, use its callback, touch its counter - * while we are in an rcu_read_lock() path. + * TF and data breakpoints are traps and can be merged, however + * instruction breakpoints are faults and will be raised + * separately. + * + * However DR6 can indicate both TF and instruction + * breakpoints. In that case take TF as that has precedence and + * delay the instruction breakpoint for the next exception. */ - rcu_read_lock(); + if (bpx && (dr6 & DR_STEP)) + continue; - bp = this_cpu_read(bp_per_reg[i]); /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling */ (*dr6_p) &= ~(DR_TRAP0 << i); - /* - * bp can be NULL due to lazy debug register switching - * or due to concurrent perf counter removing. - */ - if (!bp) { - rcu_read_unlock(); - break; - } perf_bp_event(bp, args->regs); @@ -538,11 +558,10 @@ static int hw_breakpoint_handler(struct die_args *args) * Set up resume flag to avoid breakpoint recursion when * returning back to origin. */ - if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + if (bpx) args->regs->flags |= X86_EFLAGS_RF; - - rcu_read_unlock(); } + /* * Further processing in do_debug() is needed for a) user-space * breakpoints (to generate signals) and b) when the system has diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c deleted file mode 100644 index 7dfb1e808928..000000000000 --- a/arch/x86/kernel/ima_arch.c +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Copyright (C) 2018 IBM Corporation - */ -#include <linux/efi.h> -#include <linux/module.h> -#include <linux/ima.h> - -extern struct boot_params boot_params; - -static enum efi_secureboot_mode get_sb_mode(void) -{ - efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; - efi_status_t status; - unsigned long size; - u8 secboot, setupmode; - - size = sizeof(secboot); - - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { - pr_info("ima: secureboot mode unknown, no efi\n"); - return efi_secureboot_mode_unknown; - } - - /* Get variable contents into buffer */ - status = efi.get_variable(L"SecureBoot", &efi_variable_guid, - NULL, &size, &secboot); - if (status == EFI_NOT_FOUND) { - pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - if (status != EFI_SUCCESS) { - pr_info("ima: secureboot mode unknown\n"); - return efi_secureboot_mode_unknown; - } - - size = sizeof(setupmode); - status = efi.get_variable(L"SetupMode", &efi_variable_guid, - NULL, &size, &setupmode); - - if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ - setupmode = 0; - - if (secboot == 0 || setupmode == 1) { - pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - pr_info("ima: secureboot mode enabled\n"); - return efi_secureboot_mode_enabled; -} - -bool arch_ima_get_secureboot(void) -{ - static enum efi_secureboot_mode sb_mode; - static bool initialized; - - if (!initialized && efi_enabled(EFI_BOOT)) { - sb_mode = boot_params.secure_boot; - - if (sb_mode == efi_secureboot_mode_unset) - sb_mode = get_sb_mode(); - initialized = true; - } - - if (sb_mode == efi_secureboot_mode_enabled) - return true; - else - return false; -} - -/* secureboot arch rules */ -static const char * const sb_arch_rules[] = { -#if !IS_ENABLED(CONFIG_KEXEC_SIG) - "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", -#endif /* CONFIG_KEXEC_SIG */ - "measure func=KEXEC_KERNEL_CHECK", -#if !IS_ENABLED(CONFIG_MODULE_SIG) - "appraise func=MODULE_CHECK appraise_type=imasig", -#endif - "measure func=MODULE_CHECK", - NULL -}; - -const char * const *arch_get_ima_policy(void) -{ - if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { - if (IS_ENABLED(CONFIG_MODULE_SIG)) - set_module_sig_enforced(); - return sb_arch_rules; - } - return NULL; -} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c5dd50369e2f..58aa712973ac 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -21,6 +21,7 @@ #include <asm/hw_irq.h> #include <asm/desc.h> #include <asm/traps.h> +#include <asm/thermal.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -227,7 +228,7 @@ static __always_inline void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { if (IS_ENABLED(CONFIG_X86_64)) - run_irq_on_irqstack_cond(desc->handle_irq, desc, regs); + generic_handle_irq_desc(desc); else __handle_irq(desc, regs); } @@ -374,3 +375,23 @@ void fixup_irqs(void) } } #endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +static void smp_thermal_vector(void) +{ + if (x86_thermal_enabled()) + intel_thermal_interrupt(); + else + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + ack_APIC_irq(); +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 0b79efc87be5..044902d5a3c4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -22,6 +22,7 @@ #include <asm/apic.h> #include <asm/nospec-branch.h> +#include <asm/softirq_stack.h> #ifdef CONFIG_DEBUG_STACKOVERFLOW diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 440eed558558..1c0fb96b9e39 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,7 @@ #include <linux/sched/task_stack.h> #include <asm/cpu_entry_area.h> +#include <asm/softirq_stack.h> #include <asm/irq_stack.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -48,7 +49,8 @@ static int map_irq_stack(unsigned int cpu) if (!va) return -ENOMEM; - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + /* Store actual TOS to avoid adjustment in the hotpath */ + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -60,7 +62,8 @@ static int map_irq_stack(unsigned int cpu) { void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + /* Store actual TOS to avoid adjustment in the hotpath */ + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif @@ -71,8 +74,3 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; return map_irq_stack(cpu); } - -void do_softirq_own_stack(void) -{ - run_on_irqstack_cond(__do_softirq, NULL); -} diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 0db0375235b4..8ef35063964b 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl) ret SYM_FUNC_END(native_save_fl) EXPORT_SYMBOL(native_save_fl) - -/* - * void native_restore_fl(unsigned long flags) - * %eax/%rdi: flags - */ -SYM_FUNC_START(native_restore_fl) - push %_ASM_ARG1 - popf - ret -SYM_FUNC_END(native_restore_fl) -EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 547c7abb39f5..df776cdca327 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -133,26 +133,6 @@ void synthesize_relcall(void *dest, void *from, void *to) NOKPROBE_SYMBOL(synthesize_relcall); /* - * Skip the prefixes of the instruction. - */ -static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) -{ - insn_attr_t attr; - - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - while (inat_is_legacy_prefix(attr)) { - insn++; - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - } -#ifdef CONFIG_X86_64 - if (inat_is_rex_prefix(attr)) - insn++; -#endif - return insn; -} -NOKPROBE_SYMBOL(skip_prefixes); - -/* * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ @@ -312,25 +292,6 @@ static int can_probe(unsigned long paddr) } /* - * Returns non-zero if opcode modifies the interrupt flag. - */ -static int is_IF_modifier(kprobe_opcode_t *insn) -{ - /* Skip prefixes */ - insn = skip_prefixes(insn); - - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - return 0; -} - -/* * Copy an instruction with recovering modified instruction by kprobes * and adjust the displacement if the instruction uses the %rip-relative * addressing mode. Note that since @real will be the final place of copied @@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); len += JMP32_INSN_SIZE; - p->ainsn.boostable = true; + p->ainsn.boostable = 1; } else { - p->ainsn.boostable = false; + p->ainsn.boostable = 0; } return len; @@ -450,6 +411,67 @@ void free_insn_page(void *page) module_memfree(page); } +static void set_resume_flags(struct kprobe *p, struct insn *insn) +{ + insn_byte_t opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0x9d: /* popf/popfd */ + /* Check whether the instruction modifies Interrupt Flag or not */ + p->ainsn.if_modifier = 1; + break; + case 0x9c: /* pushfl */ + p->ainsn.is_pushf = 1; + break; + case 0xcf: /* iret */ + p->ainsn.if_modifier = 1; + fallthrough; + case 0xc2: /* ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + p->ainsn.is_abs_ip = 1; + /* Without resume jump, this is boostable */ + p->ainsn.boostable = 1; + break; + case 0xe8: /* call relative - Fix return addr */ + p->ainsn.is_call = 1; + break; +#ifdef CONFIG_X86_32 + case 0x9a: /* call absolute -- same as call absolute, indirect */ + p->ainsn.is_call = 1; + p->ainsn.is_abs_ip = 1; + break; +#endif + case 0xff: + opcode = insn->opcode.bytes[1]; + if ((opcode & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; ip is correct. + * But this is not boostable + */ + p->ainsn.is_call = 1; + p->ainsn.is_abs_ip = 1; + break; + } else if (((opcode & 0x31) == 0x20) || + ((opcode & 0x31) == 0x21)) { + /* + * jmp near and far, absolute indirect + * ip is correct. + */ + p->ainsn.is_abs_ip = 1; + /* Without resume jump, this is boostable */ + p->ainsn.boostable = 1; + } + break; + } +} + static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; @@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p) */ len = prepare_boost(buf, p, &insn); - /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = is_IF_modifier(buf); + /* Analyze the opcode and set resume flags */ + set_resume_flags(p, &insn); /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; @@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p) if (!can_probe((unsigned long)p->addr)) return -EILSEQ; + + memset(&p->ainsn, 0, sizeof(p->ainsn)); + /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler); * 2) If the single-stepped instruction was a call, the return address * that is atop the stack is the address following the copied instruction. * We need to make it the address following the original instruction. - * - * If this is the first time we've single-stepped the instruction at - * this probepoint, and the instruction is boostable, boost it: add a - * jump instruction after the copied instruction, that jumps to the next - * instruction after the probepoint. */ static void resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) @@ -818,59 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, unsigned long *tos = stack_addr(regs); unsigned long copy_ip = (unsigned long)p->ainsn.insn; unsigned long orig_ip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /* Skip prefixes */ - insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; - switch (*insn) { - case 0x9c: /* pushfl */ + + /* Fixup the contents of top of stack */ + if (p->ainsn.is_pushf) { *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); *tos |= kcb->kprobe_old_flags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = true; - goto no_change; - case 0xe8: /* call relative - Fix return addr */ + } else if (p->ainsn.is_call) { *tos = orig_ip + (*tos - copy_ip); - break; -#ifdef CONFIG_X86_32 - case 0x9a: /* call absolute -- same as call absolute, indirect */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; -#endif - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; ip is correct. - * But this is not boostable - */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; - } else if (((insn[1] & 0x31) == 0x20) || - ((insn[1] & 0x31) == 0x21)) { - /* - * jmp near and far, absolute indirect - * ip is correct. And this is boostable - */ - p->ainsn.boostable = true; - goto no_change; - } - default: - break; } - regs->ip += orig_ip - copy_ip; + if (!p->ainsn.is_abs_ip) + regs->ip += orig_ip - copy_ip; -no_change: restore_btf(); } NOKPROBE_SYMBOL(resume_execution); @@ -937,6 +918,11 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * So clear it by resetting the current kprobe: */ regs->flags &= ~X86_EFLAGS_TF; + /* + * Since the single step (trap) has been cancelled, + * we need to restore BTF here. + */ + restore_btf(); /* * If the TF flag was set before the kprobe hit, diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 681a4b36e9bb..51c7f5271aee 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,17 +12,23 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct kprobe *p; struct kprobe_ctlblk *kcb; + int bit; - /* Preempt is disabled by ftrace */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -52,6 +58,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, */ __this_cpu_write(current_kprobe, NULL); } +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7f57ede3cb8e..78bb0fae3982 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -740,6 +740,11 @@ static void __init kvm_apic_init(void) #endif } +static bool __init kvm_msi_ext_dest_id(void) +{ + return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); +} + static void __init kvm_init_platform(void) { kvmclock_init(); @@ -769,6 +774,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .type = X86_HYPER_KVM, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.msi_ext_dest_id = kvm_msi_ext_dest_id, .init.init_platform = kvm_init_platform, #if defined(CONFIG_AMD_MEM_ENCRYPT) .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, @@ -830,28 +836,25 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); -out: - local_irq_restore(flags); + if (READ_ONCE(*ptr) == val) + safe_halt(); + + local_irq_enable(); + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 34b18f6eeb2c..1fc0962c89c0 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -44,7 +44,6 @@ static int __init parse_no_kvmclock_vsyscall(char *arg) early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ -#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) #define HVC_BOOT_ARRAY_SIZE \ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) @@ -269,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b8aee71840ae..aa15132228da 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm) if (!boot_cpu_has(X86_FEATURE_PTI)) return; - tlb_gather_mmu(&tlb, mm, start, end); + /* + * Although free_pgd_range() is intended for freeing user + * page-tables, it also works out for kernel mappings on x86. + * We use tlb_gather_mmu_fullmm() to avoid confusing the + * range-tracking logic in __tlb_adjust_range(). + */ + tlb_gather_mmu_fullmm(&tlb, mm); free_pgd_range(&tlb, start, end, start, end); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb); #endif } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 34b153cbd4ac..5e9a34b5bd74 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_386_PC32: + case R_386_PLT32: /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index c0d409810658..ed8ac6bcbafb 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -99,11 +99,9 @@ static int filter_write(u32 reg) if (!__ratelimit(&fw_rs)) return 0; - if (reg == MSR_IA32_ENERGY_PERF_BIAS) - return 0; - - pr_err("Write to unrecognized MSR 0x%x by %s (pid: %d). Please report to x86@kernel.org.\n", - reg, current->comm, current->pid); + pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n", + reg, current->comm, current->pid); + pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n"); return 0; } @@ -184,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = security_locked_down(LOCKDOWN_MSR); if (err) break; + + err = filter_write(regs[1]); + if (err) + return err; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4bc77aaf1303..bf250a339655 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - bool irq_state; + irqentry_state_t irq_state; /* * Re-enable NMIs right here when running as an SEV-ES guest. This might @@ -502,14 +502,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6c3407ba6ee9..c60222ab8ab9 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, else if (opfunc == _paravirt_ident_64) ret = paravirt_patch_ident_64(insn_buff, len); - else if (type == PARAVIRT_PATCH(cpu.iret) || - type == PARAVIRT_PATCH(cpu.usergs_sysret64)) + else if (type == PARAVIRT_PATCH(cpu.iret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); #endif @@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_usergs_sysret64(void); static struct resource reserve_ioports = { .start = 0, @@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - .cpu.usergs_sysret64 = native_usergs_sysret64, .cpu.iret = native_iret, - .cpu.swapgs = native_swapgs, #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, @@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .irq.safe_halt = native_safe_halt, diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c index ace6e334cb39..abd27ec67397 100644 --- a/arch/x86/kernel/paravirt_patch.c +++ b/arch/x86/kernel/paravirt_patch.c @@ -25,10 +25,7 @@ struct patch_xxl { const unsigned char mmu_read_cr2[3]; const unsigned char mmu_read_cr3[3]; const unsigned char mmu_write_cr3[3]; - const unsigned char irq_restore_fl[2]; const unsigned char cpu_wbinvd[2]; - const unsigned char cpu_usergs_sysret64[6]; - const unsigned char cpu_swapgs[3]; const unsigned char mov64[3]; }; @@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = { .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, - 0x48, 0x0f, 0x07 }, // swapgs; sysretq - .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax }; @@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, switch (type) { #ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, restore_fl, xxl, insn_buff, len); PATCH_CASE(irq, save_fl, xxl, insn_buff, len); PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); @@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); - PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); #endif diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 2e9006c1e240..42e92ec62973 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -4,9 +4,6 @@ #include <linux/string.h> #include <linux/kallsyms.h> - -#define DEBUG 1 - static struct iommu_table_entry * __init find_dependents_of(struct iommu_table_entry *start, struct iommu_table_entry *finish, diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index f9e5352b3bef..624703af80a1 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -122,7 +122,7 @@ int perf_reg_validate(u64 mask) u64 perf_reg_abi(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_IA32)) + if (!user_64bit_mode(task_pt_regs(task))) return PERF_SAMPLE_REGS_ABI_32; else return PERF_SAMPLE_REGS_ABI_64; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 145a7ac0c19a..9c214d7085a4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -161,7 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, #endif /* Kernel thread ? */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, sp, arg); return 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index df342bedea88..d08307df69ad 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -511,11 +511,10 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT -void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) { start_thread_common(regs, new_ip, new_sp, - test_thread_flag(TIF_X32) - ? __USER_CS : __USER32_CS, + x32 ? __USER_CS : __USER32_CS, __USER_DS, __USER_DS); } #endif @@ -540,7 +539,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(irq_count) != -1); + this_cpu_read(hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); @@ -641,16 +640,12 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); - clear_thread_flag(TIF_X32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; - - /* Ensure the corresponding mm is not marked. */ if (current->mm) - current->mm->context.ia32_compat = 0; + current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL; /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, @@ -662,10 +657,9 @@ void set_personality_64bit(void) static void __set_personality_x32(void) { #ifdef CONFIG_X86_X32 - clear_thread_flag(TIF_IA32); - set_thread_flag(TIF_X32); if (current->mm) - current->mm->context.ia32_compat = TIF_X32; + current->mm->context.flags = 0; + current->personality &= ~READ_IMPLIES_EXEC; /* * in_32bit_syscall() uses the presence of the x32 syscall bit @@ -683,10 +677,14 @@ static void __set_personality_x32(void) static void __set_personality_ia32(void) { #ifdef CONFIG_IA32_EMULATION - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_IA32; + if (current->mm) { + /* + * uprobes applied to this MM need to know this and + * cannot use user_64bit_mode() at that time. + */ + current->mm->context.flags = MM_CONTEXT_UPROBE_IA32; + } + current->personality |= force_personality32; /* Prepare the first "return" to user space */ task_pt_regs(current)->orig_ax = __NR_ia32_execve; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index bedca011459c..87a4143aa7d7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif +#ifdef CONFIG_X86_64 +static const struct user_regset_view user_x86_64_view; /* Initialized below. */ +#endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request, int ret; unsigned long __user *datap = (unsigned long __user *)data; +#ifdef CONFIG_X86_64 + /* This is native 64-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_64_view; +#else + /* This is native 32-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_32_view; +#endif + switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } +/* + * This is used by the core dump code to decide which regset to dump. The + * core dump code writes out the resulting .e_machine and the corresponding + * regsets. This is suboptimal if the task is messing around with its CS.L + * field, but at worst the core dump will end up missing some information. + * + * Unfortunately, it is also used by the broken PTRACE_GETREGSET and + * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have + * no way to make sure that the e_machine they use matches the caller's + * expectations. The result is that the data format returned by + * PTRACE_GETREGSET depends on the returned CS field (and even the offset + * of the returned CS field depends on its value!) and the data format + * accepted by PTRACE_SETREGSET is determined by the old CS value. The + * upshot is that it is basically impossible to use these APIs correctly. + * + * The best way to fix it in the long run would probably be to add new + * improved ptrace() APIs to read and write registers reliably, possibly by + * allowing userspace to select the ELF e_machine variant that they expect. + */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index db115943e8bd..b29657b76e3f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, }, + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + /* Sony */ { /* Handle problems with rebooting on Sony VGN-Z540N */ .callback = set_bios_reboot, @@ -538,31 +547,21 @@ static void emergency_vmx_disable_all(void) local_irq_disable(); /* - * We need to disable VMX on all CPUs before rebooting, otherwise - * we risk hanging up the machine, because the CPU ignores INIT - * signals when VMX is enabled. - * - * We can't take any locks and we may be on an inconsistent - * state, so we use NMIs as IPIs to tell the other CPUs to disable - * VMX and halt. + * Disable VMX on all CPUs before rebooting, otherwise we risk hanging + * the machine, because the CPU blocks INIT when it's in VMX root. * - * For safety, we will avoid running the nmi_shootdown_cpus() - * stuff unnecessarily, but we don't have a way to check - * if other CPUs have VMX enabled. So we will call it only if the - * CPU we are running on has VMX enabled. + * We can't take any locks and we may be on an inconsistent state, so + * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. * - * We will miss cases where VMX is not enabled on all CPUs. This - * shouldn't do much harm because KVM always enable VMX on all - * CPUs anyway. But we can miss it on the small window where KVM - * is still enabling VMX. + * Do the NMI shootdown even if VMX if off on _this_ CPU, as that + * doesn't prevent a different CPU from being in VMX root operation. */ - if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. */ - cpu_vmxoff(); + if (cpu_has_vmx()) { + /* Safely force _this_ CPU out of VMX root operation. */ + __cpu_emergency_vmxoff(); - /* Halt and disable VMX on the other CPUs */ + /* Halt and exit VMX root operation on the other CPUs. */ nmi_shootdown_cpus(vmxoff_nmi); - } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84f581c91db4..d883176ef2ce 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -16,7 +16,6 @@ #include <linux/memblock.h> #include <linux/pci.h> #include <linux/root_dev.h> -#include <linux/sfi.h> #include <linux/hugetlb.h> #include <linux/tboot.h> #include <linux/usb/xhci-dbgp.h> @@ -119,11 +118,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; -/* For MCA, but anyone else can use it if they want */ -unsigned int machine_id; -unsigned int machine_submodel_id; -unsigned int BIOS_revision; - struct apm_info apm_info; EXPORT_SYMBOL(apm_info); @@ -1054,6 +1048,12 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); + /* + * Needs to run after memblock setup because it needs the physical + * memory size. + */ + sev_setup_arch(); + reserve_bios_regions(); efi_fake_memmap(); @@ -1184,7 +1184,6 @@ void __init setup_arch(char **cmdline_p) * Read APIC and some other early information from ACPI tables. */ acpi_boot_init(); - sfi_init(); x86_dtb_init(); /* diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index 7d04b356d44d..cdc04d091242 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -305,14 +305,14 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0xe4: case 0xe5: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u64)insn->immediate.value << 16; + *exitinfo |= (u8)insn->immediate.value << 16; break; /* OUT immediate opcodes */ case 0xe6: case 0xe7: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u64)insn->immediate.value << 16; + *exitinfo |= (u8)insn->immediate.value << 16; break; /* IN register opcodes */ diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 0bd1a0fc587e..04a780abb512 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu) cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); } -static __always_inline bool on_vc_stack(unsigned long sp) +static __always_inline bool on_vc_stack(struct pt_regs *regs) { + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); } @@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs) old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); /* Make room on the IST stack */ - if (on_vc_stack(regs->sp)) + if (on_vc_stack(regs)) new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); else new_ist = old_ist - sizeof(old_ist); @@ -225,7 +235,7 @@ static inline u64 sev_es_rd_ghcb_msr(void) return __rdmsr(MSR_AMD64_SEV_ES_GHCB); } -static inline void sev_es_wr_ghcb_msr(u64 val) +static __always_inline void sev_es_wr_ghcb_msr(u64 val) { u32 low, high; @@ -248,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) int res; if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user(ctxt->regs, buffer); + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); if (!res) { ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; @@ -286,6 +296,12 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(target, size)) { + memcpy(dst, buf, size); + return ES_OK; + } + switch (size) { case 1: memcpy(&d1, buf, 1); @@ -335,6 +351,12 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(s, size)) { + memcpy(buf, src, size); + return ES_OK; + } + switch (size) { case 1: if (get_user(d1, s)) @@ -1236,13 +1258,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; - lockdep_assert_irqs_disabled(); - /* * Handle #DB before calling into !noinstr code to avoid recursive #DB. */ @@ -1251,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) return; } + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); instrumentation_begin(); /* @@ -1313,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) out: instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); return; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index be0d7d4152ec..f306e85a08a6 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { - /* - * This function is fundamentally broken as currently - * implemented. - * - * The idea is that we want to trigger a call to the - * restart_block() syscall and that we want in_ia32_syscall(), - * in_x32_syscall(), etc. to match whatever they were in the - * syscall being restarted. We assume that the syscall - * instruction at (regs->ip - 2) matches whatever syscall - * instruction we used to enter in the first place. - * - * The problem is that we can get here when ptrace pokes - * syscall-like values into regs even if we're not in a syscall - * at all. - * - * For now, we maintain historical behavior and guess based on - * stored state. We could do better by saving the actual - * syscall arch in restart_block or (with caveats on x32) by - * checking if regs->ip points to 'int $0x80'. The current - * behavior is incorrect if a tracer has a different bitness - * than the tracee. - */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI @@ -804,11 +782,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void arch_do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { struct ksignal ksig; - if (get_signal(&ksig)) { + if (has_signal && get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a7f3e12cfbdb..a5330ff498f0 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -31,7 +31,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); - BUILD_BUG_ON(NSIGSYS != 1); + BUILD_BUG_ON(NSIGSYS != 2); /* This is part of the ABI and can never change in size: */ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); @@ -165,16 +165,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { signal_compat_build_tests(); - /* Don't leak in-kernel non-uapi flags to user-space */ - if (oact) - oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (!act) return; - /* Don't let flags to be set from userspace */ - act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index de776b2e6046..02813a7f3a7c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,6 +56,7 @@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> +#include <linux/syscore_ops.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -82,6 +83,10 @@ #include <asm/hw_irq.h> #include <asm/stackprotector.h> +#ifdef CONFIG_ACPI_CPPC_LIB +#include <acpi/cppc_acpi.h> +#endif + /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); @@ -148,7 +153,7 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } -static void init_freq_invariance(bool secondary); +static void init_freq_invariance(bool secondary, bool cppc_ready); /* * Report back to the Boot Processor during boot time or to the caller processor @@ -186,7 +191,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(true); + init_freq_invariance(true, false); /* * Get our bogomips. @@ -229,6 +234,7 @@ static void notrace start_secondary(void *unused) #endif cpu_init_exception_handling(); cpu_init(); + rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); @@ -747,13 +753,14 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { + u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + apic_icr_write(APIC_DM_NMI | dm, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -980,10 +987,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->dest_logical == APIC_DEST_LOGICAL) - id = cpu0_logical_apicid; - else - id = apicid; + id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid; boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } @@ -1340,7 +1344,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); - init_freq_invariance(false); + init_freq_invariance(false, false); smp_sanity_check(); switch (apic_intr_mode) { @@ -1829,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled) arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool turbo_disabled(void) { @@ -2027,6 +2032,48 @@ out: return true; } +#ifdef CONFIG_ACPI_CPPC_LIB +static bool amd_set_max_freq_ratio(void) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf; + u64 perf_ratio; + int rc; + + rc = cppc_get_perf_caps(0, &perf_caps); + if (rc) { + pr_debug("Could not retrieve perf counters (%d)\n", rc); + return false; + } + + highest_perf = perf_caps.highest_perf; + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("Could not retrieve highest or nominal performance\n"); + return false; + } + + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; + if (!perf_ratio) { + pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); + return false; + } + + arch_turbo_freq_ratio = perf_ratio; + arch_set_max_freq_ratio(false); + + return true; +} +#else +static bool amd_set_max_freq_ratio(void) +{ + return false; +} +#endif + static void init_counter_refs(void) { u64 aperf, mperf; @@ -2038,7 +2085,24 @@ static void init_counter_refs(void) this_cpu_write(arch_prev_mperf, mperf); } -static void init_freq_invariance(bool secondary) +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) +{ + /* Bail out if registered already. */ + if (freq_invariance_syscore_ops.node.prev) + return; + + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif + +static void init_freq_invariance(bool secondary, bool cppc_ready) { bool ret = false; @@ -2054,15 +2118,39 @@ static void init_freq_invariance(bool secondary) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ret = intel_set_max_freq_ratio(); + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (!cppc_ready) { + return; + } + ret = amd_set_max_freq_ratio(); + } if (ret) { init_counter_refs(); static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } else { pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); } } +#ifdef CONFIG_ACPI_CPPC_LIB +static DEFINE_MUTEX(freq_invariance_lock); + +void init_freq_invariance_cppc(void) +{ + static bool secondary; + + mutex_lock(&freq_invariance_lock); + + init_freq_invariance(secondary, true); + secondary = true; + + mutex_unlock(&freq_invariance_lock); +} +#endif + static void disable_freq_invariance_workfn(struct work_struct *work) { static_branch_disable(&arch_scale_freq_key); @@ -2112,7 +2200,7 @@ error: schedule_work(&disable_freq_invariance_work); } #else -static inline void init_freq_invariance(bool secondary) +static inline void init_freq_invariance(bool secondary, bool cppc_ready) { } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index ca9a380d9c0b..9442c4136c38 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -11,14 +11,26 @@ enum insn_type { RET = 3, /* tramp / site cond-tail-call */ }; +/* + * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax + * The REX.W cancels the effect of any data16. + */ +static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 }; + static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) { + const void *emulate = NULL; int size = CALL_INSN_SIZE; const void *code; switch (type) { case CALL: code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + if (func == &__static_call_return0) { + emulate = code; + code = &xor5rax; + } + break; case NOP: @@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void if (unlikely(system_state == SYSTEM_BOOTING)) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, NULL); + text_poke_bp(insn, code, size, emulate); } static void __static_call_validate(void *insn, bool tail) @@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail) return; } else { if (opcode == CALL_INSN_OPCODE || - !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5)) + !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) || + !memcmp(insn, xor5rax, 5)) return; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 60d2c3798ba2..0f3c307b37b3 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child) regs->flags |= X86_EFLAGS_TF; /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also + * Always set TIF_SINGLESTEP. This will also * cause us to set TF when returning to user mode. */ set_tsk_thread_flag(child, TIF_SINGLESTEP); + /* + * Ensure that a trap is triggered once stepping out of a system + * call prior to executing any user instruction. + */ + set_task_syscall_work(child, SYSCALL_EXIT_TRAP); + oflags = regs->flags; /* Set TF on the kernel stack.. */ @@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child) /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); + clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); /* But touch TF only if it was set by us.. */ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 504fa5425bce..660b78827638 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - long error; - error = -EINVAL; if (off & ~PAGE_MASK) - goto out; + return -EINVAL; - error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); -out: - return error; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } static void find_start_end(unsigned long addr, unsigned long flags, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index ae64f98ec2ab..4c09ba110204 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0a2ec801b63f..f5477eab5692 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,6 +25,7 @@ * * Send feedback to <colpatch@us.ibm.com> */ +#include <linux/interrupt.h> #include <linux/nodemask.h> #include <linux/export.h> #include <linux/mmzone.h> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e19df6cde35d..ac1874a2a70e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> +#include <asm/vdso.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -117,6 +118,9 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); + } else { + if (fixup_vdso_exception(regs, trapnr, error_code, 0)) + return 0; } /* @@ -299,11 +303,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) local_irq_enable(); if (handle_user_split_lock(regs, error_code)) - return; + goto out; do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); +out: local_irq_disable(); } @@ -405,7 +410,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - idtentry_enter_nmi(regs); + irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -550,6 +555,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; + if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) + return; + show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); goto exit; @@ -651,12 +659,13 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } } @@ -685,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ - if (regs->ip >= (unsigned long)entry_SYSCALL_64 && - regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + if (ip_within_syscall_gap(regs)) { sp = this_cpu_read(cpu_current_top_of_stack); goto sync; } @@ -851,7 +859,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * includes the entry stack is excluded for everything. */ unsigned long dr7 = local_db_save(); - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* @@ -908,7 +916,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } @@ -926,7 +934,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* * NB: We can't easily clear DR7 here because - * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be @@ -1048,6 +1056,9 @@ static void math_error(struct pt_regs *regs, int trapnr) if (!si_code) goto exit; + if (fixup_vdso_exception(regs, trapnr, 0, 0)) + return; + force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 73f800100066..a1202536fc57 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -13,7 +13,7 @@ #define orc_warn_current(args...) \ ({ \ - if (state->task == current) \ + if (state->task == current && !state->error) \ orc_warn(args); \ }) @@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off, return false; if (state->full_regs) { - *val = ((unsigned long *)state->regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); return true; } if (state->prev_regs) { - *val = ((unsigned long *)state->prev_regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); return true; } @@ -471,7 +471,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_REG_SP_INDIRECT: - sp = state->sp + orc->sp_offset; + sp = state->sp; indirect = true; break; @@ -521,6 +521,9 @@ bool unwind_next_frame(struct unwind_state *state) if (indirect) { if (!deref_stack_reg(state, sp, &sp)) goto err; + + if (orc->sp_reg == ORC_REG_SP_INDIRECT) + sp += orc->sp_offset; } /* Find IP, SP and possibly regs: */ diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 138bdb1fd136..a2b413394917 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1017,6 +1017,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, if (uprobe_post_sstep_notifier(regs)) ret = NOTIFY_STOP; + break; + default: break; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 764573de3996..e5a7a10a0164 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); - unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + /* + * Don't write screen_bitmap in case some user had a value there + * and expected it to remain unchanged. + */ user_access_end(); @@ -160,49 +164,6 @@ Efault: do_exit(SIGSEGV); } -static void mark_screen_rdonly(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; - - mmap_write_lock(mm); - pgd = pgd_offset(mm, 0xA0000); - if (pgd_none_or_clear_bad(pgd)) - goto out; - p4d = p4d_offset(pgd, 0xA0000); - if (p4d_none_or_clear_bad(p4d)) - goto out; - pud = pud_offset(p4d, 0xA0000); - if (pud_none_or_clear_bad(pud)) - goto out; - pmd = pmd_offset(pud, 0xA0000); - - if (pmd_trans_huge(*pmd)) { - vma = find_vma(mm, 0xA0000); - split_huge_pmd(vma, pmd, 0xA0000); - } - if (pmd_none_or_clear_bad(pmd)) - goto out; - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); - for (i = 0; i < 32; i++) { - if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); - pte++; - } - pte_unmap_unlock(pte, ptl); -out: - mmap_write_unlock(mm); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); -} - - - static int do_vm86_irq_handling(int subfunction, int irqnumber); static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); @@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) offsetof(struct vm86_struct, int_revectored))) return -EFAULT; + + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ + if (v.flags & VM86_SCREEN_BITMAP) { + char comm[TASK_COMM_LEN]; + + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + return -EINVAL; + } + memset(&vm86regs, 0, sizeof(vm86regs)); vm86regs.pt.bx = v.regs.ebx; @@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86regs.gs = v.regs.gs; vm86->flags = v.flags; - vm86->screen_bitmap = v.screen_bitmap; vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, @@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) update_task_stack(tsk); preempt_enable(); - if (vm86->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); - memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); return regs->ax; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bf9e0adb5b7e..efd9e9ea17f2 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -454,13 +454,13 @@ SECTIONS ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") } -#ifdef CONFIG_X86_32 /* * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); -#else + +#ifdef CONFIG_X86_64 /* * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. @@ -470,18 +470,12 @@ INIT_PER_CPU(gdt_page); INIT_PER_CPU(fixed_percpu_data); INIT_PER_CPU(irq_stack_backing_store); -/* - * Build-time check on the image size: - */ -. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); - #ifdef CONFIG_SMP . = ASSERT((fixed_percpu_data == 0), "fixed_percpu_data is not at start of per-cpu area"); #endif -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_KEXEC_CORE #include <asm/kexec.h> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a3038d8deb6a..8b395821cb8d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -110,6 +110,7 @@ struct x86_init_ops x86_init __initdata = { .init_platform = x86_init_noop, .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, + .msi_ext_dest_id = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, .init_after_bootmem = x86_init_noop, }, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f92dfd8ef10d..a788d5120d4d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -100,7 +100,17 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) help - Provides support for launching Encrypted VMs on AMD processors. + Provides support for launching Encrypted VMs (SEV) and Encrypted VMs + with Encrypted State (SEV-ES) on AMD processors. + +config KVM_XEN + bool "Support for Xen hypercall interface" + depends on KVM + help + Provides KVM support for the hosting Xen HVM guests and + passing Xen hypercalls to userspace. + + If in doubt, say "N". config KVM_MMU_AUDIT bool "Audit KVM MMU" diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b804444e16d4..eafc4d601f25 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y += -Iarch/x86/kvm +ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror ifeq ($(CONFIG_FRAME_POINTER),y) @@ -10,13 +10,16 @@ endif KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ + $(KVM)/dirty_ring.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o + mmu/spte.o +kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o +kvm-$(CONFIG_KVM_XEN) += xen.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 83637a2ff605..6bd2f8b830e4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -146,6 +146,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) MSR_IA32_MISC_ENABLE_MWAIT); } } +EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { @@ -172,16 +173,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - kvm_mmu_reset_context(vcpu); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + kvm_hv_set_cpuid(vcpu); /* Invoke the vendor callback only after the above state is updated. */ - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); + static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu); + + /* + * Except for the MMU, which needs to be reset after any vendor + * specific adjustments to the reserved GPA bits. + */ + kvm_mmu_reset_context(vcpu); } static int is_efer_nx(void) @@ -222,6 +229,16 @@ not_found: return 36; } +/* + * This "raw" version returns the reserved GPA bits without any adjustments for + * encryption technologies that usurp bits. The raw mask should be used if and + * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs. + */ +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -320,7 +337,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; return 0; @@ -391,7 +408,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_7_0_EBX, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | + F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ @@ -418,7 +435,7 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) | F(TSXLDTRK) + F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ @@ -433,7 +450,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX512_BF16) + F(AVX_VNNI) | F(AVX512_BF16) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f7a6e8f83783..2a0c5064497f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -30,15 +30,32 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return !(gpa & vcpu->arch.reserved_gpa_bits); +} + static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); + return !kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + +static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t alignment) +{ + return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE); } struct cpuid_reg { @@ -264,6 +281,20 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); +} + +static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)); +} + static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; @@ -310,11 +341,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } -static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, unsigned int kvm_feature) { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56cae1ff9e3f..f7970ba6219f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2506,12 +2506,12 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, val = GET_SMSTATE(u32, smstate, 0x7fcc); - if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 6, val)) return X86EMUL_UNHANDLEABLE; val = GET_SMSTATE(u32, smstate, 0x7fc8); - if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 7, val)) return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7fc4); @@ -2564,14 +2564,14 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - val = GET_SMSTATE(u32, smstate, 0x7f68); + val = GET_SMSTATE(u64, smstate, 0x7f68); - if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 6, val)) return X86EMUL_UNHANDLEABLE; - val = GET_SMSTATE(u32, smstate, 0x7f60); + val = GET_SMSTATE(u64, smstate, 0x7f60); - if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 7, val)) return X86EMUL_UNHANDLEABLE; cr0 = GET_SMSTATE(u64, smstate, 0x7f58); @@ -2879,6 +2879,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data : (u32)msr_data; + if (efer & EFER_LMA) + ctxt->mode = X86EMUL_MODE_PROT64; return X86EMUL_CONTINUE; } @@ -4327,7 +4329,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_dr(ctxt, 6, &dr6); dr6 &= ~DR_TRAP_BITS; - dr6 |= DR6_BD | DR6_RTM; + dr6 |= DR6_BD | DR6_ACTIVE_LOW; ctxt->ops->set_dr(ctxt, 6, dr6); return emulate_db(ctxt); } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5c7c4060b45c..f98370a39936 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,6 +23,7 @@ #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" +#include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> @@ -36,6 +37,9 @@ #include "trace.h" #include "irq.h" +/* "Hv#1" signature */ +#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 + #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, @@ -128,7 +132,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ - kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); + kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic)); return 0; } @@ -141,10 +145,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) return NULL; vcpu = kvm_get_vcpu(kvm, vpidx); - if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + if (kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; return NULL; } @@ -155,17 +159,17 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); - if (!vcpu) + if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; - synic = vcpu_to_synic(vcpu); + synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; } static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; int gsi, idx; @@ -189,8 +193,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; hv_vcpu->exit.u.synic.msr = msr; @@ -204,7 +208,7 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; if (!synic->active && !host) @@ -282,8 +286,7 @@ static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) hv->hv_syndbg.control.status = @@ -293,8 +296,8 @@ static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; hv_vcpu->exit.u.syndbg.msr = msr; @@ -310,13 +313,13 @@ static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, - vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data); + to_hv_vcpu(vcpu)->vp_index, msr, data); switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: syndbg->control.control = data; @@ -349,7 +352,7 @@ static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; @@ -377,9 +380,7 @@ static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) break; } - trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, - vcpu_to_hv_vcpu(vcpu)->vp_index, msr, - *pdata); + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata); return 0; } @@ -421,7 +422,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_lapic_irq irq; int ret, vector; @@ -457,7 +458,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); @@ -514,15 +515,15 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) static u64 get_time_ref_counter(struct kvm *kvm) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; u64 tsc; /* - * The guest has not set up the TSC page or the clock isn't - * stable, fall back to get_kvmclock_ns. + * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, + * is broken, disabled or being updated. */ - if (!hv->tsc_ref.tsc_sequence) + if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); @@ -534,10 +535,10 @@ static u64 get_time_ref_counter(struct kvm *kvm) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); set_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + to_hv_vcpu(vcpu)->stimer_pending_bitmap); kvm_make_request(KVM_REQ_HV_STIMER, vcpu); if (vcpu_kick) kvm_vcpu_kick(vcpu); @@ -545,14 +546,14 @@ static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); - trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); hrtimer_cancel(&stimer->timer); clear_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + to_hv_vcpu(vcpu)->stimer_pending_bitmap); stimer->msg_pending = false; stimer->exp_time = 0; } @@ -562,7 +563,7 @@ static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) struct kvm_vcpu_hv_stimer *stimer; stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); - trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); stimer_mark_pending(stimer, true); @@ -579,7 +580,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) u64 time_now; ktime_t ktime_now; - time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); + time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); if (stimer->config.periodic) { @@ -596,7 +597,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) stimer->exp_time = time_now + stimer->count; trace_kvm_hv_stimer_start_periodic( - stimer_to_vcpu(stimer)->vcpu_id, + hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->exp_time); @@ -618,7 +619,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) return 0; } - trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->count); @@ -633,13 +634,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && !host) return 1; - trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); @@ -657,13 +658,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && !host) return 1; - trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); stimer_cleanup(stimer); @@ -694,7 +695,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg, bool no_retry) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); gfn_t msg_page_gfn; struct hv_message_header hv_hdr; @@ -750,7 +751,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; @@ -763,14 +764,14 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); - return synic_deliver_msg(vcpu_to_synic(vcpu), + return synic_deliver_msg(to_hv_synic(vcpu), stimer->config.sintx, msg, no_retry); } static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = stimer->config.apic_vector @@ -790,7 +791,7 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) r = stimer_send_msg(stimer); else r = stimer_notify_direct(stimer); - trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, direct, r); if (!r) { stimer->msg_pending = false; @@ -801,11 +802,14 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; u64 time_now, exp_time; int i; + if (!hv_vcpu) + return; + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; @@ -831,16 +835,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; + if (!hv_vcpu) + return; + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_cleanup(&hv_vcpu->stimer[i]); + + kfree(hv_vcpu); + vcpu->arch.hyperv = NULL; } bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return false; + + if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } @@ -880,28 +895,41 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) stimer_prepare_msg(stimer); } -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) +static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu; int i; + hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT); + if (!hv_vcpu) + return -ENOMEM; + + vcpu->arch.hyperv = hv_vcpu; + hv_vcpu->vcpu = vcpu; + synic_init(&hv_vcpu->synic); bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); -} - -void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); + + return 0; } int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv_synic *synic; + int r; + + if (!to_hv_vcpu(vcpu)) { + r = kvm_hv_vcpu_init(vcpu); + if (r) + return r; + } + + synic = to_hv_synic(vcpu); /* * Hyper-V SynIC auto EOI SINT's are @@ -939,10 +967,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr) return r; } -static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 *pdata) +static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) @@ -952,41 +979,26 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, return 0; } -static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) +static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); *pdata = hv->hv_crash_ctl; return 0; } -static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) +static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (host) - hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; + struct kvm_hv *hv = to_kvm_hv(kvm); - if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) { - - vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", - hv->hv_crash_param[0], - hv->hv_crash_param[1], - hv->hv_crash_param[2], - hv->hv_crash_param[3], - hv->hv_crash_param[4]); - - /* Send notification about crash to user space */ - kvm_make_request(KVM_REQ_HV_CRASH, vcpu); - } + hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; return 0; } -static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 data) +static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) @@ -1065,20 +1077,36 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, return true; } +/* + * Don't touch TSC page values if the guest has opted for TSC emulation after + * migration. KVM doesn't fully support reenlightenment notifications and TSC + * access emulation and Hyper-V is known to expect the values in TSC page to + * stay constant before TSC access emulation is disabled from guest side + * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC + * frequency and guest visible TSC value across migration (and prevent it when + * TSC scaling is unsupported). + */ +static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) +{ + return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && + hv->hv_tsc_emulation_control; +} + void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); u32 tsc_seq; u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) return; - mutex_lock(&kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) goto out_unlock; @@ -1089,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) + goto out_err; + + if (tsc_seq && tsc_page_update_unsafe(hv)) { + if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; + } /* * While we're computing and writing the parameters, force the @@ -1098,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - goto out_unlock; + goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - goto out_unlock; + goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - goto out_unlock; + goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. @@ -1119,17 +1155,54 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; - kvm_write_guest(kvm, gfn_to_gpa(gfn), - &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + goto out_err; + + hv->hv_tsc_page_status = HV_TSC_PAGE_SET; + goto out_unlock; + +out_err: + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; out_unlock: - mutex_unlock(&kvm->arch.hyperv.hv_lock); + mutex_unlock(&hv->hv_lock); +} + +void kvm_hv_invalidate_tsc_page(struct kvm *kvm) +{ + struct kvm_hv *hv = to_kvm_hv(kvm); + u64 gfn; + + if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || + hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || + tsc_page_update_unsafe(hv)) + return; + + mutex_lock(&hv->hv_lock); + + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + goto out_unlock; + + /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */ + if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET) + hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING; + + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + + hv->tsc_ref.tsc_sequence = 0; + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + +out_unlock: + mutex_unlock(&hv->hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); switch (msr) { case HV_X64_MSR_GUEST_OS_ID: @@ -1139,9 +1212,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; break; case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; + u8 instructions[9]; + int i = 0; + u64 addr; /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) @@ -1150,29 +1223,67 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_hypercall = data; break; } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops.patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) + + /* + * If Xen and Hyper-V hypercalls are both enabled, disambiguate + * the same way Xen itself does, by setting the bit 31 of EAX + * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just + * going to be clobbered on 64-bit. + */ + if (kvm_xen_hypercall_enabled(kvm)) { + /* orl $0x80000000, %eax */ + instructions[i++] = 0x0d; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x80; + } + + /* vmcall/vmmcall */ + static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); + i += 3; + + /* ret */ + ((unsigned char *)instructions)[i++] = 0xc3; + + addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; + if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) return 1; hv->hv_hypercall = data; - mark_page_dirty(kvm, gfn); break; } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { + if (!host) + hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; + else + hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + } else { + hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; + } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_set_crash_data(vcpu, + return kvm_hv_msr_set_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, data); case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + if (host) + return kvm_hv_msr_set_crash_ctl(kvm, data); + + if (data & HV_CRASH_CTL_CRASH_NOTIFY) { + vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", + hv->hv_crash_param[0], + hv->hv_crash_param[1], + hv->hv_crash_param[2], + hv->hv_crash_param[3], + hv->hv_crash_param[4]); + + /* Send notification about crash to user space */ + kvm_make_request(KVM_REQ_HV_CRASH, vcpu); + } + break; case HV_X64_MSR_RESET: if (data == 1) { vcpu_debug(vcpu, "hyper-v reset requested\n"); @@ -1186,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: + if (data && !host) + return 1; + hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: @@ -1216,11 +1330,11 @@ static u64 current_task_runtime_100ns(void) static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); switch (msr) { case HV_X64_MSR_VP_INDEX: { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); int vcpu_idx = kvm_vcpu_get_idx(vcpu); u32 new_vp_index = (u32)data; @@ -1291,14 +1405,14 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host); + return synic_set_msr(to_hv_synic(vcpu), msr, data, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - return stimer_set_config(vcpu_to_stimer(vcpu, timer_index), + return stimer_set_config(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_STIMER0_COUNT: @@ -1307,7 +1421,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), + return stimer_set_count(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_TSC_FREQUENCY: @@ -1330,7 +1444,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, { u64 data = 0; struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); switch (msr) { case HV_X64_MSR_GUEST_OS_ID: @@ -1346,11 +1460,11 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = hv->hv_tsc_page; break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_get_crash_data(vcpu, + return kvm_hv_msr_get_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, pdata); case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + return kvm_hv_msr_get_crash_ctl(kvm, pdata); case HV_X64_MSR_RESET: data = 0; break; @@ -1379,7 +1493,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); switch (msr) { case HV_X64_MSR_VP_INDEX: @@ -1403,14 +1517,14 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); + return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - return stimer_get_config(vcpu_to_stimer(vcpu, timer_index), + return stimer_get_config(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_STIMER0_COUNT: @@ -1419,7 +1533,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), + return stimer_get_count(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_TSC_FREQUENCY: @@ -1438,12 +1552,22 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!host && !vcpu->arch.hyperv_enabled) + return 1; + + if (!to_hv_vcpu(vcpu)) { + if (kvm_hv_vcpu_init(vcpu)) + return 1; + } + if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); @@ -1451,12 +1575,22 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!host && !vcpu->arch.hyperv_enabled) + return 1; + + if (!to_hv_vcpu(vcpu)) { + if (kvm_hv_vcpu_init(vcpu)) + return 1; + } + if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); - mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata, host); @@ -1466,7 +1600,7 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, u64 *vp_bitmap, unsigned long *vcpu_bitmap) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; int i, bank, sbank = 0; @@ -1483,18 +1617,16 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { - if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index, - (unsigned long *)vp_bitmap)) + if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) __set_bit(i, vcpu_bitmap); } return vcpu_bitmap; } -static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, - u16 rep_cnt, bool ex) +static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex) { - struct kvm *kvm = current_vcpu->kvm; - struct kvm_vcpu_hv *hv_vcpu = ¤t_vcpu->arch.hyperv; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1592,10 +1724,10 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, } } -static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa, +static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, bool ex, bool fast) { - struct kvm *kvm = current_vcpu->kvm; + struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1666,9 +1798,20 @@ ret_success: return HV_STATUS_SUCCESS; } -bool kvm_hv_hypercall_enabled(struct kvm *kvm) +void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { - return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0; + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) + vcpu->arch.hyperv_enabled = true; + else + vcpu->arch.hyperv_enabled = false; +} + +bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) @@ -1698,6 +1841,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; if (unlikely(!fast)) { @@ -1726,7 +1870,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); - eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param); + eventfd = idr_find(&hv->conn_to_evt, param); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; @@ -1745,7 +1889,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (kvm_x86_ops.get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -1793,7 +1937,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) { + if (unlikely(rep || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -1855,7 +1999,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } fallthrough; case HVCALL_RESET_DEBUG_SESSION: { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu)) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; @@ -1885,23 +2029,26 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) void kvm_hv_init_vm(struct kvm *kvm) { - mutex_init(&kvm->arch.hyperv.hv_lock); - idr_init(&kvm->arch.hyperv.conn_to_evt); + struct kvm_hv *hv = to_kvm_hv(kvm); + + mutex_init(&hv->hv_lock); + idr_init(&hv->conn_to_evt); } void kvm_hv_destroy_vm(struct kvm *kvm) { + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int i; - idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i) + idr_for_each_entry(&hv->conn_to_evt, eventfd, i) eventfd_ctx_put(eventfd); - idr_destroy(&kvm->arch.hyperv.conn_to_evt); + idr_destroy(&hv->conn_to_evt); } static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int ret; @@ -1925,7 +2072,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; mutex_lock(&hv->hv_lock); @@ -1951,8 +2098,8 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } -int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { @@ -1997,8 +2144,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; case HYPERV_CPUID_INTERFACE: - memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12); - ent->eax = signature[0]; + ent->eax = HYPERV_CPUID_SIGNATURE_EAX; break; case HYPERV_CPUID_VERSION: @@ -2037,7 +2183,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, * Direct Synthetic timers only make sense with in-kernel * LAPIC */ - if (lapic_in_kernel(vcpu)) + if (!vcpu || lapic_in_kernel(vcpu)) ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e68c6c2e9649..60547d5cb6d7 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -50,38 +50,46 @@ /* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */ #define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2) -static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) +static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm) { - return &vcpu->arch.hyperv; + return &kvm->arch.hyperv; } -static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu) +static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_arch *arch; - - arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv); - return container_of(arch, struct kvm_vcpu, arch); + return vcpu->arch.hyperv; } -static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu) +static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu) { - return &vcpu->arch.hyperv.synic; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + return &hv_vcpu->synic; } -static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) +static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) { - return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); + struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic); + + return hv_vcpu->vcpu; } -static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu) +static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu) { return &vcpu->kvm->arch.hyperv.hv_syndbg; } +static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu); +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); -bool kvm_hv_hypercall_enabled(struct kvm *kvm); +bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); @@ -89,32 +97,35 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, struct hv_vp_assist_page *assist_page); -static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, - int timer_index) +static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu, + int timer_index) { - return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index]; + return &to_hv_vcpu(vcpu)->stimer[timer_index]; } -static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) +static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu_hv *hv_vcpu; hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv, stimer[0]); - return hv_vcpu_to_vcpu(hv_vcpu); + return hv_vcpu->vcpu; } static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) { - return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap, + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return false; + + return !bitmap_empty(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); } @@ -122,11 +133,13 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock); +void kvm_hv_invalidate_tsc_page(struct kvm *kvm); void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); +void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); -int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 814698e5b152..172b05343cfd 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -14,6 +14,7 @@ #include "irq.h" #include "i8254.h" #include "x86.h" +#include "xen.h" /* * check if there are pending timer events @@ -56,6 +57,9 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.injected; + if (kvm_xen_has_interrupt(v)) + return 1; + if (!kvm_apic_accept_pic_intr(v)) return 0; @@ -110,6 +114,9 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; + if (kvm_xen_has_interrupt(v)) + return v->kvm->arch.xen.upcall_vector; + if (irqchip_split(v->kvm)) { int vector = v->arch.pending_external_vector; @@ -143,8 +150,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); - if (kvm_x86_ops.migrate_timers) - kvm_x86_ops.migrate_timers(vcpu); + static_call_cond(kvm_x86_migrate_timers)(vcpu); } bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 4aa1c2e00e2a..8a4de3f12820 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -16,8 +16,6 @@ #include <trace/events/kvm.h> -#include <asm/msidef.h> - #include "irq.h" #include "ioapic.h" @@ -104,22 +102,19 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? - (u64)e->msi.address_hi << 32 : 0), - e->msi.data); - - irq->dest_id = (e->msi.address_lo & - MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; - if (kvm->arch.x2apic_format) - irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); - irq->vector = (e->msi.data & - MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - irq->dest_mode = kvm_lapic_irq_dest_mode( - !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo)); - irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; - irq->delivery_mode = e->msi.data & 0x700; - irq->msi_redir_hint = ((e->msi.address_lo - & MSI_ADDR_REDIRECTION_LOWPRI) > 0); + struct msi_msg msg = { .address_lo = e->msi.address_lo, + .address_hi = e->msi.address_hi, + .data = e->msi.data }; + + trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? + (u64)msg.address_hi << 32 : 0), msg.data); + + irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); + irq->vector = msg.arch_data.vector; + irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); + irq->trig_mode = msg.arch_data.is_level; + irq->delivery_mode = msg.arch_data.delivery_mode << 8; + irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; irq->level = 1; irq->shorthand = APIC_DEST_NOSHORT; } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index a889563ad02d..2e11da2f5621 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -68,7 +68,7 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) return 0; if (!kvm_register_is_available(vcpu, reg)) - kvm_x86_ops.cache_reg(vcpu, reg); + static_call(kvm_x86_cache_reg)(vcpu, reg); return vcpu->arch.regs[reg]; } @@ -108,7 +108,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) might_sleep(); /* on svm */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_PDPTR); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -118,7 +118,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if ((tmask & vcpu->arch.cr0_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } @@ -132,14 +132,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if ((tmask & vcpu->arch.cr4_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 43c93ffa76ed..0d359115429a 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -205,7 +205,7 @@ struct x86_emulate_ops { ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); - int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); + void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 86c33d53c90a..cc369b9ad8f1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -91,8 +91,8 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } -struct static_key_deferred apic_hw_disabled __read_mostly; -struct static_key_deferred apic_sw_disabled __read_mostly; +__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); +__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); static inline int apic_enabled(struct kvm_lapic *apic) { @@ -290,9 +290,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; if (enabled) - static_key_slow_dec_deferred(&apic_sw_disabled); + static_branch_slow_dec_deferred(&apic_sw_disabled); else - static_key_slow_inc(&apic_sw_disabled.key); + static_branch_inc(&apic_sw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } @@ -484,7 +484,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_x86_ops.hwapic_irr_update(vcpu, + static_call(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; @@ -515,7 +515,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops.hwapic_isr_update(vcpu, vec); + static_call(kvm_x86_hwapic_isr_update)(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -563,8 +563,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops.hwapic_isr_update(vcpu, - apic_find_highest_isr(apic)); + static_call(kvm_x86_hwapic_isr_update)(vcpu, + apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -674,7 +674,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) (unsigned long long)vcpu->arch.pv_eoi.msr_val); return false; } - return val & 0x1; + return val & KVM_PV_EOI_ENABLED; } static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) @@ -701,7 +701,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; if (apic->vcpu->arch.apicv_active) - highest_irr = kvm_x86_ops.sync_pir_to_irr(apic->vcpu); + highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) @@ -1090,7 +1090,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) { + if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) { kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -1245,7 +1245,8 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap)) + if (to_hv_vcpu(apic->vcpu) && + test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap)) kvm_hv_synic_send_eoi(apic->vcpu, vector); kvm_ioapic_send_eoi(apic, vector); @@ -1641,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - kvm_wait_lapic_expire(vcpu); + /* + * Ensure the guest's timer has truly expired before posting an + * interrupt. Open code the relevant checks to avoid querying + * lapic_timer_int_injected(), which will be false since the + * interrupt isn't yet injected. Waiting until after injecting + * is not an option since that won't help a posted interrupt. + */ + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } @@ -1814,7 +1824,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic) { WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); - kvm_x86_ops.cancel_hv_timer(apic->vcpu); + static_call(kvm_x86_cancel_hv_timer)(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } @@ -1831,7 +1841,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (!ktimer->tscdeadline) return false; - if (kvm_x86_ops.set_hv_timer(vcpu, ktimer->tscdeadline, &expired)) + if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; @@ -2175,10 +2185,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) - static_key_slow_dec_deferred(&apic_hw_disabled); + static_branch_slow_dec_deferred(&apic_hw_disabled); if (!apic->sw_enabled) - static_key_slow_dec_deferred(&apic_sw_disabled); + static_branch_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) free_page((unsigned long)apic->regs); @@ -2250,9 +2260,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - static_key_slow_dec_deferred(&apic_hw_disabled); + static_branch_slow_dec_deferred(&apic_hw_disabled); } else { - static_key_slow_inc(&apic_hw_disabled.key); + static_branch_inc(&apic_hw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } } @@ -2261,7 +2271,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) - kvm_x86_ops.set_virtual_apic_mode(vcpu); + static_call(kvm_x86_set_virtual_apic_mode)(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2338,9 +2348,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { - kvm_x86_ops.apicv_post_state_restore(vcpu); - kvm_x86_ops.hwapic_irr_update(vcpu, -1); - kvm_x86_ops.hwapic_isr_update(vcpu, -1); + static_call(kvm_x86_apicv_post_state_restore)(vcpu); + static_call(kvm_x86_hwapic_irr_update)(vcpu, -1); + static_call(kvm_x86_hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2449,7 +2459,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) * thinking that APIC state has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ + static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; @@ -2512,7 +2522,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) */ apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { + if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another @@ -2594,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); @@ -2601,10 +2612,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - kvm_x86_ops.apicv_post_state_restore(vcpu); - kvm_x86_ops.hwapic_irr_update(vcpu, + static_call(kvm_x86_apicv_post_state_restore)(vcpu); + static_call(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops.hwapic_isr_update(vcpu, + static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -2843,14 +2854,35 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; + int r; unsigned long pe; - if (!lapic_in_kernel(vcpu) || !apic->pending_events) + if (!lapic_in_kernel(vcpu)) return; /* + * Read pending events before calling the check_events + * callback. + */ + pe = smp_load_acquire(&apic->pending_events); + if (!pe) + return; + + if (is_guest_mode(vcpu)) { + r = kvm_x86_ops.nested_ops->check_events(vcpu); + if (r < 0) + return; + /* + * If an event has happened and caused a vmexit, + * we know INITs are latched and therefore + * we will not incorrectly deliver an APIC + * event instead of a vmexit. + */ + } + + /* * INITs are latched while CPU is in specific states - * (SMM, VMX non-root mode, SVM with GIF=0). + * (SMM, VMX root mode, SVM with GIF=0). * Because a CPU cannot be in these states immediately * after it has processed an INIT signal (and thus in * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs @@ -2858,36 +2890,31 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) */ if (kvm_vcpu_latch_init(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); - if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) + if (test_bit(KVM_APIC_SIPI, &pe)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); return; } - pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { + clear_bit(KVM_APIC_INIT, &apic->pending_events); kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; } - if (test_bit(KVM_APIC_SIPI, &pe) && - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { - /* evaluate pending_events before reading the vector */ - smp_rmb(); - sipi_vector = apic->sipi_vector; - kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (test_bit(KVM_APIC_SIPI, &pe)) { + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + /* evaluate pending_events before reading the vector */ + smp_rmb(); + sipi_vector = apic->sipi_vector; + kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector); + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } } } -void kvm_lapic_init(void) -{ - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&apic_hw_disabled, HZ); - jump_label_rate_limit(&apic_sw_disabled, HZ); -} - void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4fb86e3a9dd3..997c45a5963a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -6,6 +6,8 @@ #include <linux/kvm_host.h> +#include "hyperv.h" + #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 @@ -125,13 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; -} - int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); -void kvm_lapic_init(void); void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) @@ -172,29 +168,29 @@ static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 va __kvm_lapic_set_reg(apic->regs, reg_off, val); } -extern struct static_key kvm_no_apic_vcpu; +DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) { - if (static_key_false(&kvm_no_apic_vcpu)) + if (static_branch_unlikely(&kvm_has_noapic_vcpu)) return vcpu->arch.apic; return true; } -extern struct static_key_deferred apic_hw_disabled; +extern struct static_key_false_deferred apic_hw_disabled; static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_hw_disabled.key)) + if (static_branch_unlikely(&apic_hw_disabled.key)) return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; return MSR_IA32_APICBASE_ENABLE; } -extern struct static_key_deferred apic_sw_disabled; +extern struct static_key_false_deferred apic_sw_disabled; static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_sw_disabled.key)) + if (static_branch_unlikely(&apic_sw_disabled.key)) return apic->sw_enabled; return true; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 9c4a9c8e43d9..c68bfc3e2402 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,12 +44,19 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -static inline u64 rsvd_bits(int s, int e) +static __always_inline u64 rsvd_bits(int s, int e) { + BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s); + + if (__builtin_constant_p(e)) + BUILD_BUG_ON(e > 63); + else + e &= 63; + if (e < s) return 0; - return ((1ULL << (e - s + 1)) - 1) << s; + return ((2ULL << (e - s)) - 1) << s; } void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); @@ -95,7 +102,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) if (!VALID_PAGE(root_hpa)) return; - kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu), + static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu), vcpu->arch.mmu->shadow_root_level); } @@ -145,7 +152,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * * TODO: introduce APIs to split these two cases. */ -static inline int is_writable_pte(unsigned long pte) +static inline bool is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -167,8 +174,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned pte_access, unsigned pte_pkey, unsigned pfec) { - int cpl = kvm_x86_ops.get_cpl(vcpu); - unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); + int cpl = static_call(kvm_x86_get_cpl)(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); /* * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7a6ae9e90bd7..486aa94ecf1d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -190,7 +190,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, int ret = -ENOTSUPP; if (range && kvm_x86_ops.tlb_remote_flush_with_range) - ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range); + ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); if (ret) kvm_flush_remote_tlbs(kvm); @@ -820,7 +820,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; - if (no_dirty_log && slot->dirty_bitmap) + if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL; return slot; @@ -844,17 +844,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, int i, count = 0; if (!rmap_head->val) { - rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); + rmap_printk("%p %llx 0->1\n", spte, *spte); rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { - rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); + rmap_printk("%p %llx 1->many\n", spte, *spte); desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; rmap_head->val = (unsigned long)desc | 1; ++count; } else { - rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); + rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc->sptes[PTE_LIST_EXT-1]) { count += PTE_LIST_EXT; @@ -906,14 +906,14 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) pr_err("%s: %p 0->BUG\n", __func__, spte); BUG(); } else if (!(rmap_head->val & 1)) { - rmap_printk("%s: %p 1->0\n", __func__, spte); + rmap_printk("%p 1->0\n", spte); if ((u64 *)rmap_head->val != spte) { pr_err("%s: %p 1->BUG\n", __func__, spte); BUG(); } rmap_head->val = 0; } else { - rmap_printk("%s: %p many->many\n", __func__, spte); + rmap_printk("%p many->many\n", spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { @@ -1115,7 +1115,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + rmap_printk("spte %p %llx\n", sptep, *sptep); if (pt_protect) spte &= ~SPTE_MMU_WRITEABLE; @@ -1142,7 +1142,7 @@ static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; - rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + rmap_printk("spte %p %llx\n", sptep, *sptep); MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; @@ -1165,7 +1165,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared. */ -static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1180,35 +1181,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) return flush; } -static bool spte_set_dirty(u64 *sptep) -{ - u64 spte = *sptep; - - rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); - - /* - * Similar to the !kvm_x86_ops.slot_disable_log_dirty case, - * do not bother adding back write access to pages marked - * SPTE_AD_WRPROT_ONLY_MASK. - */ - spte |= shadow_dirty_mask; - - return mmu_spte_update(sptep, spte); -} - -static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) -{ - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_set_dirty(sptep); - - return flush; -} - /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance @@ -1225,7 +1197,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); while (mask) { @@ -1248,25 +1220,24 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, * * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. */ -void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) +static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); - __rmap_clear_dirty(kvm, rmap_head); + __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ mask &= mask - 1; } } -EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); /** * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected @@ -1282,13 +1253,17 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - if (kvm_x86_ops.enable_log_dirty_pt_masked) - kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset, - mask); + if (kvm_x86_ops.cpu_dirty_log_size) + kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +int kvm_cpu_dirty_log_size(void) +{ + return kvm_x86_ops.cpu_dirty_log_size; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -1301,7 +1276,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); @@ -1316,14 +1291,15 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; bool flush = false; while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); + rmap_printk("spte %p %llx.\n", sptep, *sptep); pte_list_remove(rmap_head, sptep); flush = true; @@ -1336,7 +1312,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { - return kvm_zap_rmapp(kvm, rmap_head); + return kvm_zap_rmapp(kvm, rmap_head, slot); } static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1355,7 +1331,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", + rmap_printk("spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); need_flush = 1; @@ -1448,16 +1424,17 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, - int level, - unsigned long data)) +static __always_inline int +kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, + gfn_t gfn, + int level, + unsigned long data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1513,7 +1490,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); return r; @@ -1525,7 +1502,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); return r; @@ -1580,7 +1557,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) int young = false; young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); return young; @@ -1591,7 +1568,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) int young = false; young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_hva(kvm, hva); return young; @@ -1715,13 +1692,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_update_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - const void *pte) -{ - WARN_ON(1); -} - #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -2008,9 +1978,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); - cond_resched_lock(&vcpu->kvm->mmu_lock); + cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } @@ -2409,7 +2379,7 @@ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, return 0; restart: - list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) { + list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { /* * Don't zap active root pages, the page itself can't be freed * and zapping it will just force vCPUs to realloc and reload. @@ -2462,7 +2432,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - @@ -2473,7 +2443,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -2484,7 +2454,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); @@ -2492,11 +2462,25 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); + + return r; +} + +static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + int r; + + if (vcpu->arch.mmu->direct_map) + return 0; + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -2750,11 +2734,18 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. + */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - kvm_pfn_t pfn, struct kvm_memory_slot *slot) +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + struct kvm_memory_slot *slot) { unsigned long hva; pte_t *pte; @@ -2773,19 +2764,36 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, */ hva = __gfn_to_hva_memslot(slot, gfn); - pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); + pte = lookup_address_in_mm(kvm->mm, hva, &level); if (unlikely(!pte)) return PG_LEVEL_4K; return level; } +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t pfn, int max_level) +{ + struct kvm_lpage_info *linfo; + + max_level = min(max_level, max_huge_page_level); + for ( ; max_level > PG_LEVEL_4K; max_level--) { + linfo = lpage_info_slot(gfn, slot, max_level); + if (!linfo->disallow_lpage) + break; + } + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + return host_pfn_mapping_level(kvm, gfn, pfn, slot); +} + int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; @@ -2802,17 +2810,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_huge_page_level); - for ( ; max_level > PG_LEVEL_4K; max_level--) { - linfo = lpage_info_slot(gfn, slot, max_level); - if (!linfo->disallow_lpage) - break; - } - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); + level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); if (level == PG_LEVEL_4K) return level; @@ -3153,7 +3151,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); if (kvm_mmu_put_root(kvm, sp)) { - if (sp->tdp_mmu_page) + if (is_tdp_mmu_page(sp)) kvm_tdp_mmu_free_root(kvm, sp); else if (sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3184,7 +3182,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) @@ -3207,7 +3205,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3228,16 +3226,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, { struct kvm_mmu_page *sp; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu)) { - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return INVALID_PAGE; } sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return __pa(sp->spt); } @@ -3247,7 +3245,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if (vcpu->kvm->arch.tdp_mmu_enabled) { + if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); if (!VALID_PAGE(root)) @@ -3408,17 +3406,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) !smp_load_acquire(&sp->unsync_children)) return; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); mmu_sync_children(vcpu, sp); kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { @@ -3432,9 +3430,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } -EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) @@ -3485,26 +3482,25 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. */ -static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { struct kvm_shadow_walk_iterator iterator; - int leaf = vcpu->arch.mmu->root_level; + int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr); + for (shadow_walk_init(&iterator, vcpu, addr), + *root_level = iterator.level; shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); - sptes[leaf - 1] = spte; + sptes[leaf] = spte; if (!is_shadow_present_pte(spte)) break; - } walk_shadow_page_lockless_end(vcpu); @@ -3512,14 +3508,12 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) return leaf; } -/* return true if reserved bit is detected on spte. */ +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { - u64 sptes[PT64_ROOT_MAX_LEVEL]; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; - int root = vcpu->arch.mmu->shadow_root_level; - int leaf; - int level; + int root, leaf, level; bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { @@ -3528,35 +3522,45 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) } if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) - leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else - leaf = get_walk(vcpu, addr, sptes); + leaf = get_walk(vcpu, addr, sptes, &root); + + if (unlikely(leaf < 0)) { + *sptep = 0ull; + return reserved; + } + + *sptep = sptes[leaf]; + + /* + * Skip reserved bits checks on the terminal leaf if it's not a valid + * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by + * design, always have reserved bits set. The purpose of the checks is + * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. + */ + if (!is_shadow_present_pte(sptes[leaf])) + leaf++; rsvd_check = &vcpu->arch.mmu->shadow_zero_check; - for (level = root; level >= leaf; level--) { - if (!is_shadow_present_pte(sptes[level - 1])) - break; + for (level = root; level >= leaf; level--) /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. */ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | - __is_rsvd_bits_set(rsvd_check, sptes[level - 1], - level); - } + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) | + __is_rsvd_bits_set(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[level - 1], level); + sptes[level], level); } - *sptep = sptes[leaf - 1]; - return reserved; } @@ -3643,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, + bool write, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3657,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, + write, writable, hva); if (!async) return false; /* *pfn has correct page already */ @@ -3671,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, + write, writable, hva); return false; } @@ -3684,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; kvm_pfn_t pfn; + hva_t hva; int r; if (page_fault_handle_page_track(vcpu, error_code, gfn)) @@ -3702,15 +3709,21 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + read_lock(&vcpu->kvm->mmu_lock); + else + write_lock(&vcpu->kvm->mmu_lock); + + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) @@ -3724,7 +3737,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + read_unlock(&vcpu->kvm->mmu_lock); + else + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -3798,7 +3814,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = true; @@ -3969,20 +3984,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, static void __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, int level, bool nx, bool gbpages, + u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { - u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; + u64 high_bits_rsvd; rsvd_check->bad_mt_xwr = 0; - if (!nx) - exb_bit_rsvd = rsvd_bits(63, 63); if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); + if (level == PT32E_ROOT_LEVEL) + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); + else + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); + + /* Note, NX doesn't exist in PDPTEs, this is handled below. */ + if (!nx) + high_bits_rsvd |= rsvd_bits(63, 63); + /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. @@ -4011,45 +4033,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 63) | - rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PDE */ - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PTE */ - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | + high_bits_rsvd | + rsvd_bits(5, 8) | + rsvd_bits(1, 2); /* PDPTE */ + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: - rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: - rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - gbpages_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | + gbpages_bit_rsvd; + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | - gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 29); - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | + gbpages_bit_rsvd | + rsvd_bits(13, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; @@ -4060,8 +4076,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), context->root_level, - context->nx, + vcpu->arch.reserved_gpa_bits, + context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), guest_cpuid_is_amd_or_hygon(vcpu)); @@ -4069,27 +4085,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, bool execonly) + u64 pa_bits_rsvd, bool execonly) { + u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 bad_mt_xwr; - rsvd_check->rsvd_bits_mask[0][4] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][3] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - rsvd_check->rsvd_bits_mask[1][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ @@ -4108,7 +4119,12 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), execonly); + vcpu->arch.reserved_gpa_bits, execonly); +} + +static inline u64 reserved_hpa_bits(void) +{ + return rsvd_bits(shadow_phys_bits, 63); } /* @@ -4130,7 +4146,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + reserved_hpa_bits(), context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4167,14 +4183,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - shadow_phys_bits, - false); + reserved_hpa_bits(), false); if (!shadow_me_mask) return; @@ -4194,7 +4209,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - shadow_phys_bits, execonly); + reserved_hpa_bits(), execonly); } #define BYTE_MASK(access) \ @@ -4380,7 +4395,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; - context->update_pte = paging64_update_pte; context->shadow_root_level = level; context->direct_map = false; } @@ -4409,7 +4423,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; - context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = false; } @@ -4491,7 +4504,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->get_guest_pgd = get_cr3; @@ -4663,7 +4675,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; - context->update_pte = ept_update_pte; context->root_level = level; context->direct_map = false; context->mmu_role.as_u64 = new_role.as_u64; @@ -4796,7 +4807,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_pgd(vcpu); - kvm_x86_ops.tlb_flush_current(vcpu); + static_call(kvm_x86_tlb_flush_current)(vcpu); out: return r; } @@ -4811,19 +4822,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_unload); -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - const void *new) -{ - if (sp->role.level != PG_LEVEL_4K) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } - - ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu->update_pte(vcpu, sp, spte, new); -} - static bool need_remote_flush(u64 old, u64 new) { if (!is_shadow_present_pte(old)) @@ -4939,22 +4937,6 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -/* - * Ignore various flags when determining if a SPTE can be immediately - * overwritten for the current MMU. - * - level: explicitly checked in mmu_pte_write_new_pte(), and will never - * match the current MMU role, as MMU's level tracks the root level. - * - access: updated based on the new guest PTE - * - quadrant: handled by get_written_sptes() - * - invalid: always false (loop only walks valid shadow pages) - */ -static const union kvm_mmu_page_role role_ign = { - .level = 0xf, - .access = 0x7, - .quadrant = 0x3, - .invalid = 0x1, -}; - static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, struct kvm_page_track_notifier_node *node) @@ -4984,7 +4966,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ mmu_topup_memory_caches(vcpu, true); - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5005,14 +4987,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, local_flush = true; while (npte--) { - u32 base_role = vcpu->arch.mmu->mmu_role.base.word; - entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); - if (gentry && - !((sp->role.word ^ base_role) & ~role_ign.word) && - rmap_can_add(vcpu)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); + if (gentry && sp->role.level != PG_LEVEL_4K) + ++vcpu->kvm->stat.mmu_pde_zapped; if (need_remote_flush(entry, *spte)) remote_flush = true; ++spte; @@ -5020,24 +4998,8 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); - spin_unlock(&vcpu->kvm->mmu_lock); -} - -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu->direct_map) - return 0; - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return r; + write_unlock(&vcpu->kvm->mmu_lock); } -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) @@ -5110,7 +5072,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(gva, vcpu)) return; - kvm_x86_ops.tlb_flush_gva(vcpu, gva); + static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); } if (!mmu->invlpg) @@ -5137,7 +5099,6 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->invlpg(vcpu, gva, root_hpa); } } -EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { @@ -5167,7 +5128,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (tlb_flush) - kvm_x86_ops.tlb_flush_gva(vcpu, gva); + static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); ++vcpu->stat.invlpg; @@ -5177,7 +5138,6 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) * for them. */ } -EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, int tdp_huge_page_level) @@ -5202,7 +5162,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool @@ -5216,16 +5177,16 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); + flush |= fn(kvm, iterator.rmap, memslot); - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && lock_flush_tlb) { kvm_flush_remote_tlbs_with_address(kvm, start_gfn, iterator.gfn - start_gfn + 1); flush = false; } - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); } } @@ -5250,22 +5211,6 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1, - KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { @@ -5375,7 +5320,7 @@ restart: * be in active use by the guest. */ if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + cond_resched_rwlock_write(&kvm->mmu_lock)) { batch = 0; goto restart; } @@ -5408,7 +5353,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); /* @@ -5432,10 +5377,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_zap_obsolete_pages(kvm); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_all(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) @@ -5477,7 +5422,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5495,17 +5440,18 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } - if (kvm->arch.tdp_mmu_enabled) { + if (is_tdp_mmu_enabled(kvm)) { flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); if (flush) kvm_flush_remote_tlbs(kvm); } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } @@ -5516,12 +5462,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); /* * We can flush all the TLBs out of the mmu lock without TLB @@ -5539,7 +5485,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5560,8 +5507,8 @@ restart: * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - (kvm_is_zone_device_pfn(pfn) || - PageCompound(pfn_to_page(pfn)))) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, + pfn, PG_LEVEL_NUM)) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -5581,13 +5528,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - spin_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, - kvm_mmu_zap_collapsible_spte, true); + struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; + + write_lock(&kvm->mmu_lock); + slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - if (kvm->arch.tdp_mmu_enabled) - kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + write_unlock(&kvm->mmu_lock); } void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, @@ -5610,11 +5558,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); /* * It's also safe to flush TLBs out of mmu lock here as currently this @@ -5625,40 +5573,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } -EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); - -void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - bool flush; - - spin_lock(&kvm->mmu_lock); - flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, - false); - if (kvm->arch.tdp_mmu_enabled) - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); - spin_unlock(&kvm->mmu_lock); - - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); -} -EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); - -void kvm_mmu_slot_set_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - bool flush; - - spin_lock(&kvm->mmu_lock); - flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); - if (kvm->arch.tdp_mmu_enabled) - flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); - spin_unlock(&kvm->mmu_lock); - - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); -} -EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); void kvm_mmu_zap_all(struct kvm *kvm) { @@ -5666,23 +5580,23 @@ void kvm_mmu_zap_all(struct kvm *kvm) LIST_HEAD(invalid_list); int ign; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; - if (cond_resched_lock(&kvm->mmu_lock)) + if (cond_resched_rwlock_write(&kvm->mmu_lock)) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_all(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) @@ -5742,7 +5656,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) continue; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm_has_zapped_obsolete_pages(kvm)) { kvm_mmu_commit_zap_page(kvm, @@ -5753,7 +5667,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); /* @@ -5970,10 +5884,11 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); + bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; @@ -5990,22 +5905,22 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - if (sp->tdp_mmu_page) - kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, - sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); - else { + if (is_tdp_mmu_page(sp)) { + flush = kvm_tdp_mmu_zap_sp(kvm, sp); + } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index c8d51a37e2ce..ced15fd58fde 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct static_key mmu_audit_key; +static DEFINE_STATIC_KEY_FALSE(mmu_audit_key); static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_key_false((&mmu_audit_key))) + if (static_branch_unlikely((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - static_key_slow_inc(&mmu_audit_key); + static_branch_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - static_key_slow_dec(&mmu_audit_key); + static_branch_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bfc6389edc28..1f6f98c76bdf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -12,7 +12,7 @@ extern bool dbg; #define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0) #define MMU_WARN_ON(x) WARN_ON(x) #else #define pgprintk(x...) do { } while (0) @@ -56,7 +56,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; +#ifdef CONFIG_X86_64 bool tdp_mmu_page; + + /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + struct rcu_head rcu_head; +#endif }; extern struct kmem_cache *mmu_page_header_cache; @@ -73,15 +78,23 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) { /* - * When using the EPT page-modification log, the GPAs in the log - * would come from L2 rather than L1. Therefore, we need to rely - * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. + * When using the EPT page-modification log, the GPAs in the CPU dirty + * log would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages, which bypasses PML, since + * writes now result in a vmexit. Note, the check on CPU dirty logging + * being enabled is mandatory as the bits used to denote WP-only SPTEs + * are reserved for NPT w/ PAE (32-bit KVM). */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; + return vcpu->arch.mmu == &vcpu->arch.guest_mmu && + kvm_x86_ops.cpu_dirty_log_size; } bool is_nx_huge_page_enabled(void); @@ -133,6 +146,8 @@ enum { #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) #define SET_SPTE_SPURIOUS BIT(2) +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t pfn, int max_level); int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 213699b27b44..e798489b56b5 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -381,6 +381,35 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_tdp_mmu_spte_changed, + TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte), + TP_ARGS(as_id, gfn, level, old_spte, new_spte), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, old_spte) + __field(u64, new_spte) + /* Level cannot be larger than 5 on x86, so it fits in a u8. */ + __field(u8, level) + /* as_id can only be 0 or 1 x86, so it fits in a u8. */ + __field(u8, as_id) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->old_spte = old_spte; + __entry->new_spte = new_spte; + __entry->level = level; + __entry->as_id = as_id; + ), + + TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx", + __entry->as_id, __entry->gfn, __entry->level, + __entry->old_spte, __entry->new_spte + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 8443a675715b..34bb0ec69bd8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); @@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_del_rcu(&n->node); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 50e268eb8e1a..55d7b473ac44 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. + */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); @@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; + hva_t hva; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; int max_level; @@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) @@ -868,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, } r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + write_lock(&vcpu->kvm->mmu_lock); + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); @@ -881,7 +889,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -919,7 +927,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -954,7 +962,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c51ad544f25b..ef55f0bc4ccf 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -120,7 +120,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, + spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 2b3a30bd38b0..6de3950fd704 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -131,6 +131,25 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT /* + * If a thread running without exclusive control of the MMU lock must perform a + * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * non-present intermediate value. Other threads which encounter this value + * should not modify the SPTE. + * + * This constant works because it is considered non-present on both AMD and + * Intel CPUs and does not create a L1TF vulnerability because the pfn section + * is zeroed out. + * + * Only used by the TDP MMU. + */ +#define REMOVED_SPTE (1ull << 59) + +static inline bool is_removed_spte(u64 spte) +{ + return spte == REMOVED_SPTE; +} + +/* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll @@ -185,23 +204,19 @@ static inline bool is_access_track_spte(u64 spte) return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } -static inline int is_shadow_present_pte(u64 pte) +static inline bool is_shadow_present_pte(u64 pte) { - return (pte != 0) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte); } -static inline int is_large_pte(u64 pte) +static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; } -static inline int is_last_spte(u64 pte, int level) +static inline bool is_last_spte(u64 pte, int level) { - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; + return (level == PG_LEVEL_4K) || is_large_pte(pte); } static inline bool is_executable_pte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 87b7e16911db..b3ed302c1a35 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); } static gfn_t round_gfn_for_level(gfn_t gfn, int level) @@ -21,25 +21,37 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) } /* + * Return the TDP iterator to the root PT and allow it to continue its + * traversal over the paging structure from there. + */ +void tdp_iter_restart(struct tdp_iter *iter) +{ + iter->yielded_gfn = iter->next_last_level_gfn; + iter->level = iter->root_level; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* * Sets a TDP iterator to walk a pre-order traversal of the paging structure - * rooted at root_pt, starting with the walk to translate goal_gfn. + * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, - int min_level, gfn_t goal_gfn) + int min_level, gfn_t next_last_level_gfn) { WARN_ON(root_level < 1); WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); - iter->goal_gfn = goal_gfn; + iter->next_last_level_gfn = next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->level = root_level; - iter->pt_path[iter->level - 1] = root_pt; - - iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); - tdp_iter_refresh_sptep(iter); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; + iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); - iter->valid = true; + tdp_iter_restart(iter); } /* @@ -47,7 +59,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, * address of the child page table referenced by the SPTE. Returns null if * there is no such entry. */ -u64 *spte_to_child_pt(u64 spte, int level) +tdp_ptep_t spte_to_child_pt(u64 spte, int level) { /* * There's no child entry if this entry isn't present or is a @@ -56,7 +68,7 @@ u64 *spte_to_child_pt(u64 spte, int level) if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) return NULL; - return __va(spte_to_pfn(spte) << PAGE_SHIFT); + return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT); } /* @@ -65,7 +77,7 @@ u64 *spte_to_child_pt(u64 spte, int level) */ static bool try_step_down(struct tdp_iter *iter) { - u64 *child_pt; + tdp_ptep_t child_pt; if (iter->level == iter->min_level) return false; @@ -74,7 +86,7 @@ static bool try_step_down(struct tdp_iter *iter) * Reread the SPTE before stepping down to avoid traversing into page * tables that are no longer linked from this entry. */ - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); child_pt = spte_to_child_pt(iter->old_spte, iter->level); if (!child_pt) @@ -82,7 +94,7 @@ static bool try_step_down(struct tdp_iter *iter) iter->level--; iter->pt_path[iter->level - 1] = child_pt; - iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; @@ -106,9 +118,9 @@ static bool try_step_side(struct tdp_iter *iter) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); - iter->goal_gfn = iter->gfn; + iter->next_last_level_gfn = iter->gfn; iter->sptep++; - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); return true; } @@ -158,25 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -/* - * Restart the walk over the paging structure from the root, starting from the - * highest gfn the iterator had previously reached. Assumes that the entire - * paging structure, except the root page, may have been completely torn down - * and rebuilt. - */ -void tdp_iter_refresh_walk(struct tdp_iter *iter) -{ - gfn_t goal_gfn = iter->goal_gfn; - - if (iter->gfn > goal_gfn) - goal_gfn = iter->gfn; - - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, goal_gfn); -} - -u64 *tdp_iter_root_pt(struct tdp_iter *iter) -{ - return iter->pt_path[iter->root_level - 1]; -} - diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 47170d0dc98e..b1748b988d3a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -7,6 +7,8 @@ #include "mmu.h" +typedef u64 __rcu *tdp_ptep_t; + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ @@ -15,11 +17,17 @@ struct tdp_iter { * The iterator will traverse the paging structure towards the mapping * for this GFN. */ - gfn_t goal_gfn; + gfn_t next_last_level_gfn; + /* + * The next_last_level_gfn at the time when the thread last + * yielded. Only yielding when the next_last_level_gfn != + * yielded_gfn helps ensure forward progress. + */ + gfn_t yielded_gfn; /* Pointers to the page tables traversed to reach the current SPTE */ - u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ - u64 *sptep; + tdp_ptep_t sptep; /* The lowest GFN mapped by the current SPTE */ gfn_t gfn; /* The level of the root page given to the iterator */ @@ -28,6 +36,8 @@ struct tdp_iter { int min_level; /* The iterator's current level within the paging structure */ int level; + /* The address space ID, i.e. SMM vs. regular. */ + int as_id; /* A snapshot of the value at sptep */ u64 old_spte; /* @@ -49,12 +59,11 @@ struct tdp_iter { #define for_each_tdp_pte(iter, root, root_level, start, end) \ for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) -u64 *spte_to_child_pt(u64 pte, int level); +tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, - int min_level, gfn_t goal_gfn); + int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -void tdp_iter_refresh_walk(struct tdp_iter *iter); -u64 *tdp_iter_root_pt(struct tdp_iter *iter); +void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 84c8f06bec26..018d82e73e31 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -7,30 +7,23 @@ #include "tdp_mmu.h" #include "spte.h" -#ifdef CONFIG_X86_64 +#include <asm/cmpxchg.h> +#include <trace/events/kvm.h> + static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); -#endif - -static bool is_tdp_mmu_enabled(void) -{ -#ifdef CONFIG_X86_64 - return tdp_enabled && READ_ONCE(tdp_mmu_enabled); -#else - return false; -#endif /* CONFIG_X86_64 */ -} /* Initializes the TDP MMU for the VM, if enabled. */ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - if (!is_tdp_mmu_enabled()) + if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) return; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); } @@ -40,42 +33,73 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) return; WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); + + /* + * Ensure that all the outstanding RCU callbacks to free shadow pages + * can run before the VM is torn down. + */ + rcu_barrier(); } -#define for_each_tdp_mmu_root(_kvm, _root) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) +static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + if (kvm_mmu_put_root(kvm, root)) + kvm_tdp_mmu_free_root(kvm, root); +} -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, + struct kvm_mmu_page *root) { - struct kvm_mmu_page *sp; + lockdep_assert_held_write(&kvm->mmu_lock); - if (!kvm->arch.tdp_mmu_enabled) - return false; - if (WARN_ON(!VALID_PAGE(hpa))) + if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) return false; - sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; + kvm_mmu_get_root(kvm, root); + return true; - return sp->tdp_mmu_page && sp->root_count; } +static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + struct kvm_mmu_page *next_root; + + next_root = list_next_entry(root, link); + tdp_mmu_put_root(kvm, root); + return next_root; +} + +/* + * Note: this iterator gets and puts references to the roots it iterates over. + * This makes it safe to release the MMU lock and yield within the loop, but + * if exiting the loop early, the caller must drop the reference to the most + * recent root. (Unless keeping a live reference is desirable.) + */ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ + for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ + typeof(*_root), link); \ + tdp_mmu_next_root_valid(_kvm, _root); \ + _root = tdp_mmu_next_root(_kvm, _root)) + +#define for_each_tdp_mmu_root(_kvm, _root) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield); + gfn_t start, gfn_t end, bool can_yield, bool flush); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(root->root_count); WARN_ON(!root->tdp_mmu_page); list_del(&root->link); - zap_gfn_range(kvm, root, 0, max_gfn, false); + zap_gfn_range(kvm, root, 0, max_gfn, false, false); free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); @@ -108,6 +132,8 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, sp->gfn = gfn; sp->tdp_mmu_page = true; + trace_kvm_mmu_get_page(sp, true); + return sp; } @@ -119,13 +145,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); /* Check for an existing root before allocating a new one. */ for_each_tdp_mmu_root(kvm, root) { if (root->role.word == role.word) { kvm_mmu_get_root(kvm, root); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return root; } } @@ -135,7 +161,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) list_add(&root->link, &kvm->arch.tdp_mmu_roots); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return root; } @@ -151,14 +177,32 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) return __pa(root->spt); } -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level); +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) +{ + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); +} -static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +/* + * This is called through call_rcu in order to free TDP page table memory + * safely with respect to other kernel threads that may be operating on + * the memory. + * By only accessing TDP MMU page table memory in an RCU read critical + * section, and freeing it after a grace period, lockless access to that + * memory won't use it after it is freed. + */ +static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) { - return sp->role.smm ? 1 : 0; + struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, + rcu_head); + + tdp_mmu_free_sp(sp); } +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared); + static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); @@ -185,8 +229,146 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, if ((!is_writable_pte(old_spte) || pfn_changed) && is_writable_pte(new_spte)) { slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(slot, gfn); + mark_page_dirty_in_slot(kvm, slot, gfn); + } +} + +/** + * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the new page + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. + * @account_nx: This page replaces a NX large page and should be marked for + * eventual reclaim. + */ +static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared, bool account_nx) +{ + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); + + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +} + +/** + * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the page to be removed + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. + */ +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared) +{ + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); + + list_del(&sp->link); + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +} + +/** + * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * + * @kvm: kvm instance + * @pt: the page removed from the paging structure + * @shared: This operation may not be running under the exclusive use + * of the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. + * + * Given a page table that has been removed from the TDP paging structure, + * iterates through the page table to clear SPTEs and free child page tables. + * + * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU + * protection. Since this thread removed it from the paging structure, + * this thread will be responsible for ensuring the page is freed. Hence the + * early rcu_dereferences in the function. + */ +static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, + bool shared) +{ + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); + int level = sp->role.level; + gfn_t base_gfn = sp->gfn; + u64 old_child_spte; + u64 *sptep; + gfn_t gfn; + int i; + + trace_kvm_mmu_prepare_zap_page(sp); + + tdp_mmu_unlink_page(kvm, sp, shared); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + sptep = rcu_dereference(pt) + i; + gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); + + if (shared) { + /* + * Set the SPTE to a nonpresent value that other + * threads will not overwrite. If the SPTE was + * already marked as removed then another thread + * handling a page fault could overwrite it, so + * set the SPTE until it is set from some other + * value to the removed SPTE value. + */ + for (;;) { + old_child_spte = xchg(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_child_spte)) + break; + cpu_relax(); + } + } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ + old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; + + /* + * Marking the SPTE as a removed SPTE is not + * strictly necessary here as the MMU lock will + * stop other threads from concurrently modifying + * this SPTE. Using the removed SPTE value keeps + * the two branches consistent and simplifies + * the function. + */ + WRITE_ONCE(*sptep, REMOVED_SPTE); + } + handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, + old_child_spte, REMOVED_SPTE, level - 1, + shared); } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } /** @@ -197,22 +379,22 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * @old_spte: The value of the SPTE before the change * @new_spte: The value of the SPTE after the change * @level: the level of the PT the SPTE is part of in the paging structure + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. * * Handle bookkeeping that might result from the modification of a SPTE. * This function must be called for all TDP SPTE modifications. */ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - u64 *pt; - struct kvm_mmu_page *sp; - u64 old_child_spte; - int i; WARN_ON(level > PT64_ROOT_MAX_LEVEL); WARN_ON(level < PG_LEVEL_4K); @@ -244,6 +426,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (old_spte == new_spte) return; + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -251,15 +435,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE, it is - * unexpected. Log the change, though it should not impact the - * guest since both the former and current SPTEs are nonpresent. + * If this change does not involve a MMIO SPTE or removed SPTE, + * it is unexpected. Log the change, though it should not + * impact the guest since both the former and current SPTEs + * are nonpresent. */ - if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + if (WARN_ON(!is_mmio_spte(old_spte) && + !is_mmio_spte(new_spte) && + !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" - "are MMIO SPTEs.\n" + "are MMIO SPTEs, or the new SPTE is\n" + "a temporary removed SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -274,57 +462,124 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Recursively handle child PTs if the change removed a subtree from * the paging structure. */ - if (was_present && !was_leaf && (pfn_changed || !is_present)) { - pt = spte_to_child_pt(old_spte, level); - sp = sptep_to_sp(pt); + if (was_present && !was_leaf && (pfn_changed || !is_present)) + handle_removed_tdp_mmu_page(kvm, + spte_to_child_pt(old_spte, level), shared); +} - list_del(&sp->link); +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, + shared); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); +/* + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the + * associated bookkeeping + * + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + lockdep_assert_held_read(&kvm->mmu_lock); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - old_child_spte = READ_ONCE(*(pt + i)); - WRITE_ONCE(*(pt + i), 0); - handle_changed_spte(kvm, as_id, - gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), - old_child_spte, 0, level - 1); - } + /* + * Do not change removed SPTEs. Only the thread that froze the SPTE + * may modify it. + */ + if (iter->old_spte == REMOVED_SPTE) + return false; + + if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, + new_spte) != iter->old_spte) + return false; - kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); - free_page((unsigned long)pt); - kmem_cache_free(mmu_page_header_cache, sp); - } + return true; } -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) +static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); - handle_changed_spte_acc_track(old_spte, new_spte, level); - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + /* + * Freeze the SPTE by setting it to a special, + * non-present value. This will stop other threads from + * immediately installing a present entry in its place + * before the TLBs are flushed. + */ + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + return false; + + kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, + KVM_PAGES_PER_HPAGE(iter->level)); + + /* + * No other thread can overwrite the removed SPTE as they + * must either wait on the MMU lock or use + * tdp_mmu_set_spte_atomic which will not overrite the + * special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present + * to non-present. + */ + WRITE_ONCE(*rcu_dereference(iter->sptep), 0); + + return true; } + +/* + * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * @record_acc_track: Notify the MM subsystem of changes to the accessed state + * of the page. Should be set unless handling an MMU + * notifier for access tracking. Leaving record_acc_track + * unset in that case prevents page accesses from being + * double counted. + * @record_dirty_log: Record the page as dirty in the dirty bitmap if + * appropriate for the change being made. Should be set + * unless performing certain dirty logging operations. + * Leaving record_dirty_log unset in that case prevents page + * writes from being double counted. + */ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - u64 *root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); + lockdep_assert_held_write(&kvm->mmu_lock); + + /* + * No thread should be using this function to set SPTEs to the + * temporary removed SPTE value. + * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic + * should be used. If operating under the MMU lock in write mode, the + * use of the removed SPTE should not be necessary. + */ + WARN_ON(iter->old_spte == REMOVED_SPTE); - WRITE_ONCE(*iter->sptep, new_spte); + WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); - __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level); } @@ -364,27 +619,44 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, _mmu->shadow_root_level, _start, _end) /* - * Flush the TLB if the process should drop kvm->mmu_lock. - * Return whether the caller still needs to flush the tlb. + * Yield if the MMU lock is contended or this thread needs to return control + * to the scheduler. + * + * If this function should yield and flush is set, it will perform a remote + * TLB flush before yielding. + * + * If this function yields, it will also reset the tdp_iter's walk over the + * paging structure and the calling function should skip to the next + * iteration to allow the iterator to continue its traversal from the + * paging structure root. + * + * Return true if this function yielded and the iterator's traversal was reset. + * Return false if a yield was not needed. */ -static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, bool flush) { - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_flush_remote_tlbs(kvm); - cond_resched_lock(&kvm->mmu_lock); - tdp_iter_refresh_walk(iter); + /* Ensure forward progress has been made before yielding. */ + if (iter->next_last_level_gfn == iter->yielded_gfn) return false; - } else { + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + rcu_read_unlock(); + + if (flush) + kvm_flush_remote_tlbs(kvm); + + cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_lock(); + + WARN_ON(iter->gfn > iter->next_last_level_gfn); + + tdp_iter_restart(iter); + return true; } -} -static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) -{ - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - cond_resched_lock(&kvm->mmu_lock); - tdp_iter_refresh_walk(iter); - } + return false; } /* @@ -396,15 +668,24 @@ static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. + * operation can cause a soft lockup. Note, in some use cases a flush may be + * required by prior actions. Ensure the pending flush is performed prior to + * yielding. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield) + gfn_t start, gfn_t end, bool can_yield, bool flush) { struct tdp_iter iter; - bool flush_needed = false; + + rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { + if (can_yield && + tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + flush = false; + continue; + } + if (!is_shadow_present_pte(iter.old_spte)) continue; @@ -419,13 +700,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_set_spte(kvm, &iter, 0); - - if (can_yield) - flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); - else - flush_needed = true; + flush = true; } - return flush_needed; + + rcu_read_unlock(); + return flush; } /* @@ -434,22 +713,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield) { struct kvm_mmu_page *root; bool flush = false; - for_each_tdp_mmu_root(kvm, root) { - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - - flush |= zap_gfn_range(kvm, root, start, end, true); - - kvm_mmu_put_root(kvm, root); - } + for_each_tdp_mmu_root_yield_safe(kvm, root) + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); return flush; } @@ -477,10 +748,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, int ret = 0; int make_spte_ret = 0; - if (unlikely(is_noslot_pfn(pfn))) { + if (unlikely(is_noslot_pfn(pfn))) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); - trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); - } else + else make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, @@ -488,8 +758,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else - tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + return RET_PF_RETRY; /* * If the page fault was caused by a write but the page is write @@ -503,10 +773,16 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ - if (unlikely(is_mmio_spte(new_spte))) + if (unlikely(is_mmio_spte(new_spte))) { + trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, + new_spte); ret = RET_PF_EMULATE; + } else + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); if (!prefault) vcpu->stat.pf_fixed++; @@ -544,6 +820,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); + + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { if (nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(iter.old_spte, gfn, @@ -559,49 +838,61 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - tdp_mmu_set_spte(vcpu->kvm, &iter, 0); - - kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, - KVM_PAGES_PER_HPAGE(iter.level)); + if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) + break; /* * The iter must explicitly re-read the spte here * because the new value informs the !present * path below. */ - iter.old_spte = READ_ONCE(*iter.sptep); + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); } if (!is_shadow_present_pte(iter.old_spte)) { sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); - list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); child_pt = sp->spt; - clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - trace_kvm_mmu_get_page(sp, true); - if (huge_page_disallowed && req_level >= iter.level) - account_huge_nx_page(vcpu->kvm, sp); - - tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, + new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, true, + huge_page_disallowed && + req_level >= iter.level); + + trace_kvm_mmu_get_page(sp, true); + } else { + tdp_mmu_free_sp(sp); + break; + } } } - if (WARN_ON(iter.level != level)) + if (iter.level != level) { + rcu_read_unlock(); return RET_PF_RETRY; + } ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, pfn, prefault); + rcu_read_unlock(); return ret; } -static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned long data, - int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, - gfn_t end, unsigned long data)) +static __always_inline int +kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, + gfn_t start, + gfn_t end, + unsigned long data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -609,13 +900,7 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, int ret = 0; int as_id; - for_each_tdp_mmu_root(kvm, root) { - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - + for_each_tdp_mmu_root_yield_safe(kvm, root) { as_id = kvm_mmu_page_as_id(root); slots = __kvm_memslots(kvm, as_id); kvm_for_each_memslot(memslot, slots) { @@ -637,8 +922,6 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, ret |= handler(kvm, memslot, root, gfn_start, gfn_end, data); } - - kvm_mmu_put_root(kvm, root); } return ret; @@ -649,7 +932,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, unsigned long unused) { - return zap_gfn_range(kvm, root, start, end, false); + return zap_gfn_range(kvm, root, start, end, false, false); } int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, @@ -671,6 +954,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, int young = 0; u64 new_spte = 0; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { /* * If we have a non-accessed entry we don't need to change the @@ -698,8 +983,12 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); young = 1; + + trace_kvm_age_page(iter.gfn, iter.level, slot, young); } + rcu_read_unlock(); + return young; } @@ -745,6 +1034,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, u64 new_spte; int need_flush = 0; + rcu_read_lock(); + WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); @@ -773,6 +1064,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, if (need_flush) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + rcu_read_unlock(); + return 0; } @@ -796,21 +1089,27 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); for_each_tdp_pte_min_level(iter, root->spt, root->role.level, min_level, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + continue; + if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + !is_last_spte(iter.old_spte, iter.level) || + !(iter.old_spte & PT_WRITABLE_MASK)) continue; new_spte = iter.old_spte & ~PT_WRITABLE_MASK; tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; - - tdp_mmu_iter_cond_resched(kvm, &iter); } + + rcu_read_unlock(); return spte_set; } @@ -826,21 +1125,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); - - kvm_mmu_put_root(kvm, root); } return spte_set; @@ -860,7 +1151,12 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + continue; + if (spte_ad_need_write_protect(iter.old_spte)) { if (is_writable_pte(iter.old_spte)) new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -875,9 +1171,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; - - tdp_mmu_iter_cond_resched(kvm, &iter); } + + rcu_read_unlock(); return spte_set; } @@ -894,21 +1190,13 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); } return spte_set; @@ -927,6 +1215,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, struct tdp_iter iter; u64 new_spte; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) @@ -936,6 +1226,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, !(mask & (1UL << (iter.gfn - gfn)))) continue; + mask &= ~(1UL << (iter.gfn - gfn)); + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { if (is_writable_pte(iter.old_spte)) new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -949,9 +1241,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, } tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); - - mask &= ~(1UL << (iter.gfn - gfn)); } + + rcu_read_unlock(); } /* @@ -969,7 +1261,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root; int root_as_id; - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) @@ -980,89 +1272,43 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, } /* - * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is - * only used for PML, and so will involve setting the dirty bit on each SPTE. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. - */ -static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) -{ - struct tdp_iter iter; - u64 new_spte; - bool spte_set = false; - - tdp_root_for_each_pte(iter, root, start, end) { - if (!is_shadow_present_pte(iter.old_spte)) - continue; - - new_spte = iter.old_spte | shadow_dirty_mask; - - tdp_mmu_set_spte(kvm, &iter, new_spte); - spte_set = true; - - tdp_mmu_iter_cond_resched(kvm, &iter); - } - - return spte_set; -} - -/* - * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is - * only used for PML, and so will involve setting the dirty bit on each SPTE. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. - */ -bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - struct kvm_mmu_page *root; - int root_as_id; - bool spte_set = false; - - for_each_tdp_mmu_root(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - - spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); - } - return spte_set; -} - -/* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Clear leaf entries which could be replaced by large mappings, for + * GFNs within the slot. */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) + struct kvm_memory_slot *slot) { + gfn_t start = slot->base_gfn; + gfn_t end = start + slot->npages; struct tdp_iter iter; kvm_pfn_t pfn; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { + spte_set = false; + continue; + } + if (!is_shadow_present_pte(iter.old_spte) || - is_last_spte(iter.old_spte, iter.level)) + !is_last_spte(iter.old_spte, iter.level)) continue; pfn = spte_to_pfn(iter.old_spte); if (kvm_is_reserved_pfn(pfn) || - !PageTransCompoundMap(pfn_to_page(pfn))) + iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, + pfn, PG_LEVEL_NUM)) continue; tdp_mmu_set_spte(kvm, &iter, 0); - spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + spte_set = true; } + rcu_read_unlock(); if (spte_set) kvm_flush_remote_tlbs(kvm); } @@ -1072,26 +1318,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm, * be replaced by large mappings, for GFNs within the slot. */ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) + struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; int root_as_id; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - - zap_collapsible_spte_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); + zap_collapsible_spte_range(kvm, root, slot); } } @@ -1107,6 +1344,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { if (!is_writable_pte(iter.old_spte)) break; @@ -1118,6 +1357,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, spte_set = true; } + rcu_read_unlock(); + return spte_set; } @@ -1133,7 +1374,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, int root_as_id; bool spte_set = false; - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) @@ -1148,17 +1389,24 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; - int leaf = vcpu->arch.mmu->shadow_root_level; gfn_t gfn = addr >> PAGE_SHIFT; + int leaf = -1; + + *root_level = vcpu->arch.mmu->shadow_root_level; + + rcu_read_lock(); tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; - sptes[leaf - 1] = iter.old_spte; + sptes[leaf] = iter.old_spte; } + rcu_read_unlock(); + return leaf; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 556e065503f6..31096ece9b14 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,14 +5,32 @@ #include <linux/kvm_host.h> -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); - -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, + gfn_t end) +{ + return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); +} +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + + /* + * Don't allow yielding, as the caller may have a flush pending. Note, + * if mmu_lock is held for write, zapping will never yield in this case, + * but explicitly disallow it for safety. The TDP MMU does not yield + * until it has made forward progress (steps sideways), and when zapping + * a single shadow page that it's guaranteed to see (thus the mmu_lock + * requirement), its "step sideways" will always step beyond the bounds + * of the shadow page's gfn range and stop iterating before yielding. + */ + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); +} void kvm_tdp_mmu_zap_all(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -37,12 +55,41 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); + struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level); + +#ifdef CONFIG_X86_64 +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } +#else +static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +#endif + +static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + if (!is_tdp_mmu_enabled(kvm)) + return false; + if (WARN_ON(!VALID_PAGE(hpa))) + return false; + + sp = to_shadow_page(hpa); + if (WARN_ON(!sp)) + return false; + + return is_tdp_mmu_page(sp) && sp->root_count; +} + #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 7f0059aa30e1..a8502e02f479 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* variable MTRRs */ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); - mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { /* MTRR base */ if (!valid_mtrr_type(data & 0xff)) @@ -84,12 +84,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } else /* MTRR mask */ mask |= 0x7ff; - if (data & mask) { - kvm_inject_gp(vcpu, 0); - return false; - } - return true; + return (data & mask) == 0; } EXPORT_SYMBOL_GPL(kvm_mtrr_valid); @@ -355,14 +351,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (var_mtrr_range_is_valid(cur)) list_del(&mtrr_state->var_ranges[index].node); - /* Extend the mask with all 1 bits to the left, since those - * bits must implicitly be 0. The bits are then cleared - * when reading them. + /* + * Set all illegal GPA bits in the mask, since those bits must + * implicitly be 0. The bits are then cleared when reading them. */ if (!is_mtrr_mask) cur->base = data; else - cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu)); + cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); /* add it to the list if it's enabled. */ if (var_mtrr_range_is_valid(cur)) { @@ -430,7 +426,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; - *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1; + *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } return 0; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 67741d2a0308..827886c12c16 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -373,7 +373,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 1; if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) && - (kvm_x86_ops.get_cpl(vcpu) != 0) && + (static_call(kvm_x86_get_cpl)(vcpu) != 0) && (kvm_read_cr0(vcpu) & X86_CR0_PE)) return 1; @@ -383,8 +383,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - if (lapic_in_kernel(vcpu)) + if (lapic_in_kernel(vcpu)) { + if (kvm_x86_ops.pmu_ops->deliver_pmi) + kvm_x86_ops.pmu_ops->deliver_pmi(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); + } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) @@ -473,6 +476,9 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); } + if (kvm_x86_ops.pmu_ops->cleanup) + kvm_x86_ops.pmu_ops->cleanup(vcpu); + bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 067fef51760c..7b30bc967af3 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -39,6 +39,8 @@ struct kvm_pmu_ops { void (*refresh)(struct kvm_vcpu *vcpu); void (*init)(struct kvm_vcpu *vcpu); void (*reset)(struct kvm_vcpu *vcpu); + void (*deliver_pmi)(struct kvm_vcpu *vcpu); + void (*cleanup)(struct kvm_vcpu *vcpu); }; static inline u64 pmc_bitmask(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8c550999ace0..78bdcfac4e40 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -233,7 +233,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, */ static int avic_update_access_page(struct kvm *kvm, bool activate) { - int ret = 0; + void __user *ret; + int r = 0; mutex_lock(&kvm->slots_lock); /* @@ -249,13 +250,15 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, activate ? PAGE_SIZE : 0); - if (ret) + if (IS_ERR(ret)) { + r = PTR_ERR(ret); goto out; + } kvm->arch.apic_access_page_done = activate; out: mutex_unlock(&kvm->slots_lock); - return ret; + return r; } static int avic_init_backing_page(struct kvm_vcpu *vcpu) @@ -295,6 +298,23 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, + u32 icrl, u32 icrh) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, source, + icrl & APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } +} + int avic_incomplete_ipi_interception(struct vcpu_svm *svm) { u32 icrh = svm->vmcb->control.exit_info_1 >> 32; @@ -321,28 +341,14 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm) kvm_lapic_reg_write(apic, APIC_ICR2, icrh); kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; - case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { - int i; - struct kvm_vcpu *vcpu; - struct kvm *kvm = svm->vcpu.kvm; - struct kvm_lapic *apic = svm->vcpu.arch.apic; - + case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: /* * At this point, we expect that the AVIC HW has already * set the appropriate IRR bits on the valid target * vcpus. So, we just need to kick the appropriate vcpu. */ - kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, apic, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) - kvm_vcpu_wake_up(vcpu); - } + avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh); break; - } case AVIC_IPI_FAILURE_INVALID_TARGET: WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", index, svm->vcpu.vcpu_id, icrh, icrl); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c9e7b86350d6..fb204eaa8bb3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -51,6 +51,23 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, nested_svm_vmexit(svm); } +static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) +{ + struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(!is_guest_mode(vcpu)); + + if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && + !svm->nested.nested_run_pending) { + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = fault->error_code; + svm->vmcb->control.exit_info_2 = fault->address; + nested_svm_vmexit(svm); + } else { + kvm_inject_page_fault(vcpu, fault); + } +} + static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) { struct vcpu_svm *svm = to_svm(vcpu); @@ -58,7 +75,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, offset_in_page(cr3) + index * 8, 8); if (ret) return 0; @@ -199,6 +216,10 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + + if (WARN_ON(!is_guest_mode(vcpu))) + return true; + if (!nested_svm_vmrun_msrpm(svm)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -227,6 +248,7 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { + struct kvm_vcpu *vcpu = &svm->vcpu; bool vmcb12_lma; /* @@ -247,21 +269,13 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); - if (!vmcb12_lma) { - if (vmcb12->save.cr4 & X86_CR4_PAE) { - if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) - return false; - } else { - if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) - return false; - } - } else { + if (vmcb12_lma) { if (!(vmcb12->save.cr4 & X86_CR4_PAE) || !(vmcb12->save.cr0 & X86_CR0_PE) || - (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) + kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) + if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; return true; @@ -355,7 +369,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)) + if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && @@ -388,7 +402,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.ds = vmcb12->save.ds; svm->vmcb->save.gdtr = vmcb12->save.gdtr; svm->vmcb->save.idtr = vmcb12->save.idtr; - kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags); + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); /* * Force-set EFER_SVME even though it is checked earlier on the @@ -408,8 +422,8 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.rax = vmcb12->save.rax; svm->vmcb->save.rsp = vmcb12->save.rsp; svm->vmcb->save.rip = vmcb12->save.rip; - svm->vmcb->save.dr7 = vmcb12->save.dr7; - svm->vcpu.arch.dr6 = vmcb12->save.dr6; + svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; svm->vmcb->save.cpl = vmcb12->save.cpl; } @@ -453,15 +467,32 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, { int ret; + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, + vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl); + + trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, + vmcb12->control.intercepts[INTERCEPT_CR] >> 16, + vmcb12->control.intercepts[INTERCEPT_EXCEPTION], + vmcb12->control.intercepts[INTERCEPT_WORD3], + vmcb12->control.intercepts[INTERCEPT_WORD4], + vmcb12->control.intercepts[INTERCEPT_WORD5]); + + svm->nested.vmcb12_gpa = vmcb12_gpa; - nested_prepare_vmcb_save(svm, vmcb12); nested_prepare_vmcb_control(svm); + nested_prepare_vmcb_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); if (ret) return ret; + if (!npt_enabled) + svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + svm_set_gif(svm, true); return 0; @@ -508,18 +539,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm) goto out; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, - vmcb12->save.rip, - vmcb12->control.int_ctl, - vmcb12->control.event_inj, - vmcb12->control.nested_ctl); - - trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, - vmcb12->control.intercepts[INTERCEPT_CR] >> 16, - vmcb12->control.intercepts[INTERCEPT_EXCEPTION], - vmcb12->control.intercepts[INTERCEPT_WORD3], - vmcb12->control.intercepts[INTERCEPT_WORD4], - vmcb12->control.intercepts[INTERCEPT_WORD5]); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -611,6 +630,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -676,13 +697,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.gdtr = hsave->save.gdtr; svm->vmcb->save.idtr = hsave->save.idtr; kvm_set_rflags(&svm->vcpu, hsave->save.rflags); + kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED); svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); kvm_rax_write(&svm->vcpu, hsave->save.rax); kvm_rsp_write(&svm->vcpu, hsave->save.rsp); kvm_rip_write(&svm->vcpu, hsave->save.rip); - svm->vmcb->save.dr7 = 0; + svm->vmcb->save.dr7 = DR7_FIXED_1; svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; @@ -769,6 +791,7 @@ void svm_leave_nested(struct vcpu_svm *svm) leave_guest_mode(&svm->vcpu); copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); + vmcb_mark_all_dirty(svm->vmcb); } kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); @@ -1211,6 +1234,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * in the registers, the save area of the nested state instead * contains saved L1 state. */ + + svm->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); hsave->save = *save; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 035da07500e8..fdf587f19c5f 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -98,6 +98,8 @@ static enum index msr_to_index(u32 msr) static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, enum pmu_type type) { + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -105,6 +107,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTL3: case MSR_F15H_PERF_CTL4: case MSR_F15H_PERF_CTL5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: if (type != PMU_TYPE_EVNTSEL) return NULL; @@ -115,6 +120,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTR3: case MSR_F15H_PERF_CTR4: case MSR_F15H_PERF_CTR5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: if (type != PMU_TYPE_COUNTER) return NULL; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 566f4d18185b..874ea309279f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -14,10 +14,21 @@ #include <linux/psp-sev.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/processor.h> +#include <linux/trace_events.h> +#include <asm/fpu/internal.h> + +#include <asm/trapnr.h> #include "x86.h" #include "svm.h" +#include "svm_ops.h" +#include "cpuid.h" +#include "trace.h" + +#define __ex(x) __kvm_handle_fault_on_reboot(x) +static u8 sev_enc_bit; static int sev_flush_asids(void); static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -25,7 +36,6 @@ unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; -#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) struct enc_region { struct list_head list; @@ -57,19 +67,19 @@ static int sev_flush_asids(void) } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(void) +static bool __sev_recycle_asids(int min_asid, int max_asid) { int pos; /* Check if there are any ASIDs to reclaim before performing a flush */ - pos = find_next_bit(sev_reclaim_asid_bitmap, - max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) + pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid); + if (pos >= max_asid) return false; if (sev_flush_asids()) return false; + /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, max_sev_asid); bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); @@ -77,20 +87,23 @@ static bool __sev_recycle_asids(void) return true; } -static int sev_asid_new(void) +static int sev_asid_new(struct kvm_sev_info *sev) { + int pos, min_asid, max_asid; bool retry = true; - int pos; mutex_lock(&sev_bitmap_lock); /* - * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. */ + min_asid = sev->es_active ? 0 : min_sev_asid - 1; + max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: - pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) { - if (retry && __sev_recycle_asids()) { + pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid); + if (pos >= max_asid) { + if (retry && __sev_recycle_asids(min_asid, max_asid)) { retry = false; goto again; } @@ -172,7 +185,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (unlikely(sev->active)) return ret; - asid = sev_asid_new(); + asid = sev_asid_new(sev); if (asid < 0) return ret; @@ -191,6 +204,16 @@ e_free: return ret; } +static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + if (!sev_es) + return -ENOTTY; + + to_kvm_svm(kvm)->sev_info.es_active = true; + + return sev_guest_init(kvm, argp); +} + static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { struct sev_data_activate *data; @@ -320,6 +343,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long first, last; int ret; + lockdep_assert_held(&kvm->lock); + if (ulen == 0 || uaddr + ulen < uaddr) return ERR_PTR(-EINVAL); @@ -490,6 +515,96 @@ e_free: return ret; } +static int sev_es_sync_vmsa(struct vcpu_svm *svm) +{ + struct vmcb_save_area *save = &svm->vmcb->save; + + /* Check some debug related fields before encrypting the VMSA */ + if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1)) + return -EINVAL; + + /* Sync registgers */ + save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; + save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; + save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX]; + save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP]; + save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP]; + save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI]; + save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI]; +#ifdef CONFIG_X86_64 + save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8]; + save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9]; + save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10]; + save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11]; + save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12]; + save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13]; + save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14]; + save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15]; +#endif + save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP]; + + /* Sync some non-GPR registers before encrypting */ + save->xcr0 = svm->vcpu.arch.xcr0; + save->pkru = svm->vcpu.arch.pkru; + save->xss = svm->vcpu.arch.ia32_xss; + + /* + * SEV-ES will use a VMSA that is pointed to by the VMCB, not + * the traditional VMSA that is part of the VMCB. Copy the + * traditional VMSA as it has been built so far (in prep + * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. + */ + memcpy(svm->vmsa, save, sizeof(*save)); + + return 0; +} + +static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_launch_update_vmsa *vmsa; + int i, ret; + + if (!sev_es_guest(kvm)) + return -ENOTTY; + + vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL); + if (!vmsa) + return -ENOMEM; + + for (i = 0; i < kvm->created_vcpus; i++) { + struct vcpu_svm *svm = to_svm(kvm->vcpus[i]); + + /* Perform some pre-encryption checks against the VMSA */ + ret = sev_es_sync_vmsa(svm); + if (ret) + goto e_free; + + /* + * The LAUNCH_UPDATE_VMSA command will perform in-place + * encryption of the VMSA memory content (i.e it will write + * the same memory region with the guest's key), so invalidate + * it first. + */ + clflush_cache_range(svm->vmsa, PAGE_SIZE); + + vmsa->handle = sev->handle; + vmsa->address = __sme_pa(svm->vmsa); + vmsa->len = PAGE_SIZE; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa, + &argp->error); + if (ret) + goto e_free; + + svm->vcpu.arch.guest_state_protected = true; + } + +e_free: + kfree(vmsa); + return ret; +} + static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *measure = (void __user *)(uintptr_t)argp->data; @@ -927,12 +1042,80 @@ e_unpin_memory: return ret; } +static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + void __user *report = (void __user *)(uintptr_t)argp->data; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_attestation_report *data; + struct kvm_sev_attestation_report params; + void __user *p; + void *blob = NULL; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + /* User wants to query the blob length */ + if (!params.len) + goto cmd; + + p = (void __user *)(uintptr_t)params.uaddr; + if (p) { + if (params.len > SEV_FW_BLOB_MAX_SIZE) { + ret = -EINVAL; + goto e_free; + } + + ret = -ENOMEM; + blob = kmalloc(params.len, GFP_KERNEL); + if (!blob) + goto e_free; + + data->address = __psp_pa(blob); + data->len = params.len; + memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce)); + } +cmd: + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error); + /* + * If we query the session length, FW responded with expected data. + */ + if (!params.len) + goto done; + + if (ret) + goto e_free_blob; + + if (blob) { + if (copy_to_user(p, blob, params.len)) + ret = -EFAULT; + } + +done: + params.len = data->len; + if (copy_to_user(report, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free_blob: + kfree(blob); +e_free: + kfree(data); + return ret; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; - if (!svm_sev_enabled()) + if (!svm_sev_enabled() || !sev) return -ENOTTY; if (!argp) @@ -947,12 +1130,18 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_ES_INIT: + r = sev_es_guest_init(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; case KVM_SEV_LAUNCH_UPDATE_DATA: r = sev_launch_update_data(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_UPDATE_VMSA: + r = sev_launch_update_vmsa(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_MEASURE: r = sev_launch_measure(kvm, &sev_cmd); break; @@ -971,6 +1160,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_SECRET: r = sev_launch_secret(kvm, &sev_cmd); break; + case KVM_SEV_GET_ATTESTATION_REPORT: + r = sev_get_attestation_report(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -1001,12 +1193,20 @@ int svm_register_enc_region(struct kvm *kvm, if (!region) return -ENOMEM; + mutex_lock(&kvm->lock); region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); + mutex_unlock(&kvm->lock); goto e_free; } + region->uaddr = range->addr; + region->size = range->size; + + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + /* * The guest may change the memory encryption attribute from C=0 -> C=1 * or vice versa for this memory range. Lets make sure caches are @@ -1015,13 +1215,6 @@ int svm_register_enc_region(struct kvm *kvm, */ sev_clflush_pages(region->pages, region->npages); - region->uaddr = range->addr; - region->size = range->size; - - mutex_lock(&kvm->lock); - list_add_tail(®ion->list, &sev->regions_list); - mutex_unlock(&kvm->lock); - return ret; e_free: @@ -1125,49 +1318,61 @@ void sev_vm_destroy(struct kvm *kvm) sev_asid_free(sev->asid); } -int __init sev_hardware_setup(void) +void __init sev_hardware_setup(void) { - struct sev_user_data_status *status; - int rc; + unsigned int eax, ebx, ecx, edx; + bool sev_es_supported = false; + bool sev_supported = false; + + /* Does the CPU support SEV? */ + if (!boot_cpu_has(X86_FEATURE_SEV)) + goto out; + + /* Retrieve SEV CPUID information */ + cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); + + /* Set encryption bit location for SEV-ES guests */ + sev_enc_bit = ebx & 0x3f; /* Maximum number of encrypted guests supported simultaneously */ - max_sev_asid = cpuid_ecx(0x8000001F); + max_sev_asid = ecx; if (!svm_sev_enabled()) - return 1; + goto out; /* Minimum ASID value that should be used for SEV guest */ - min_sev_asid = cpuid_edx(0x8000001F); + min_sev_asid = edx; /* Initialize SEV ASID bitmaps */ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) - return 1; + goto out; sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_reclaim_asid_bitmap) - return 1; + goto out; - status = kmalloc(sizeof(*status), GFP_KERNEL); - if (!status) - return 1; + pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1); + sev_supported = true; - /* - * Check SEV platform status. - * - * PLATFORM_STATUS can be called in any state, if we failed to query - * the PLATFORM status then either PSP firmware does not support SEV - * feature or SEV firmware is dead. - */ - rc = sev_platform_status(status, NULL); - if (rc) - goto err; + /* SEV-ES support requested? */ + if (!sev_es) + goto out; - pr_info("SEV supported\n"); + /* Does the CPU support SEV-ES? */ + if (!boot_cpu_has(X86_FEATURE_SEV_ES)) + goto out; -err: - kfree(status); - return rc; + /* Has the system been allocated ASIDs for SEV-ES? */ + if (min_sev_asid == 1) + goto out; + + pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1); + sev_es_supported = true; + +out: + sev = sev_supported; + sev_es = sev_es_supported; } void sev_hardware_teardown(void) @@ -1181,13 +1386,327 @@ void sev_hardware_teardown(void) sev_flush_asids(); } +/* + * Pages used by hardware to hold guest encrypted state must be flushed before + * returning them to the system. + */ +static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va, + unsigned long len) +{ + /* + * If hardware enforced cache coherency for encrypted mappings of the + * same physical page is supported, nothing to do. + */ + if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) + return; + + /* + * If the VM Page Flush MSR is supported, use it to flush the page + * (using the page virtual address and the guest ASID). + */ + if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) { + struct kvm_sev_info *sev; + unsigned long va_start; + u64 start, stop; + + /* Align start and stop to page boundaries. */ + va_start = (unsigned long)va; + start = (u64)va_start & PAGE_MASK; + stop = PAGE_ALIGN((u64)va_start + len); + + if (start < stop) { + sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + + while (start < stop) { + wrmsrl(MSR_AMD64_VM_PAGE_FLUSH, + start | sev->asid); + + start += PAGE_SIZE; + } + + return; + } + + WARN(1, "Address overflow, using WBINVD\n"); + } + + /* + * Hardware should always have one of the above features, + * but if not, use WBINVD and issue a warning. + */ + WARN_ONCE(1, "Using WBINVD to flush guest memory\n"); + wbinvd_on_all_cpus(); +} + +void sev_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm; + + if (!sev_es_guest(vcpu->kvm)) + return; + + svm = to_svm(vcpu); + + if (vcpu->arch.guest_state_protected) + sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE); + __free_page(virt_to_page(svm->vmsa)); + + if (svm->ghcb_sa_free) + kfree(svm->ghcb_sa); +} + +static void dump_ghcb(struct vcpu_svm *svm) +{ + struct ghcb *ghcb = svm->ghcb; + unsigned int nbits; + + /* Re-use the dump_invalid_vmcb module parameter */ + if (!dump_invalid_vmcb) { + pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); + return; + } + + nbits = sizeof(ghcb->save.valid_bitmap) * 8; + + pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", + ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", + ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", + ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", + ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); +} + +static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct ghcb *ghcb = svm->ghcb; + + /* + * The GHCB protocol so far allows for the following data + * to be returned: + * GPRs RAX, RBX, RCX, RDX + * + * Copy their values, even if they may not have been written during the + * VM-Exit. It's the guest's responsibility to not consume random data. + */ + ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); + ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); + ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); + ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); +} + +static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct ghcb *ghcb = svm->ghcb; + u64 exit_code; + + /* + * The GHCB protocol so far allows for the following data + * to be supplied: + * GPRs RAX, RBX, RCX, RDX + * XCR0 + * CPL + * + * VMMCALL allows the guest to provide extra registers. KVM also + * expects RSI for hypercalls, so include that, too. + * + * Copy their values to the appropriate location if supplied. + */ + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); + + vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb); + + svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb); + + if (ghcb_xcr0_is_valid(ghcb)) { + vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); + kvm_update_cpuid_runtime(vcpu); + } + + /* Copy the GHCB exit information into the VMCB fields */ + exit_code = ghcb_get_sw_exit_code(ghcb); + control->exit_code = lower_32_bits(exit_code); + control->exit_code_hi = upper_32_bits(exit_code); + control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); + control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); + + /* Clear the valid entries fields */ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static int sev_es_validate_vmgexit(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu; + struct ghcb *ghcb; + u64 exit_code = 0; + + ghcb = svm->ghcb; + + /* Only GHCB Usage code 0 is supported */ + if (ghcb->ghcb_usage) + goto vmgexit_err; + + /* + * Retrieve the exit code now even though is may not be marked valid + * as it could help with debugging. + */ + exit_code = ghcb_get_sw_exit_code(ghcb); + + if (!ghcb_sw_exit_code_is_valid(ghcb) || + !ghcb_sw_exit_info_1_is_valid(ghcb) || + !ghcb_sw_exit_info_2_is_valid(ghcb)) + goto vmgexit_err; + + switch (ghcb_get_sw_exit_code(ghcb)) { + case SVM_EXIT_READ_DR7: + break; + case SVM_EXIT_WRITE_DR7: + if (!ghcb_rax_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_RDTSC: + break; + case SVM_EXIT_RDPMC: + if (!ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_CPUID: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + if (ghcb_get_rax(ghcb) == 0xd) + if (!ghcb_xcr0_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_INVD: + break; + case SVM_EXIT_IOIO: + if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) { + if (!ghcb_sw_scratch_is_valid(ghcb)) + goto vmgexit_err; + } else { + if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK)) + if (!ghcb_rax_is_valid(ghcb)) + goto vmgexit_err; + } + break; + case SVM_EXIT_MSR: + if (!ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + if (ghcb_get_sw_exit_info_1(ghcb)) { + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rdx_is_valid(ghcb)) + goto vmgexit_err; + } + break; + case SVM_EXIT_VMMCALL: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_cpl_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_RDTSCP: + break; + case SVM_EXIT_WBINVD: + break; + case SVM_EXIT_MONITOR: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rcx_is_valid(ghcb) || + !ghcb_rdx_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_MWAIT: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_MMIO_READ: + case SVM_VMGEXIT_MMIO_WRITE: + if (!ghcb_sw_scratch_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_NMI_COMPLETE: + case SVM_VMGEXIT_AP_HLT_LOOP: + case SVM_VMGEXIT_AP_JUMP_TABLE: + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + break; + default: + goto vmgexit_err; + } + + return 0; + +vmgexit_err: + vcpu = &svm->vcpu; + + if (ghcb->ghcb_usage) { + vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", + ghcb->ghcb_usage); + } else { + vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n", + exit_code); + dump_ghcb(svm); + } + + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + + return -EINVAL; +} + +static void pre_sev_es_run(struct vcpu_svm *svm) +{ + if (!svm->ghcb) + return; + + if (svm->ghcb_sa_free) { + /* + * The scratch area lives outside the GHCB, so there is a + * buffer that, depending on the operation performed, may + * need to be synced, then freed. + */ + if (svm->ghcb_sa_sync) { + kvm_write_guest(svm->vcpu.kvm, + ghcb_get_sw_scratch(svm->ghcb), + svm->ghcb_sa, svm->ghcb_sa_len); + svm->ghcb_sa_sync = false; + } + + kfree(svm->ghcb_sa); + svm->ghcb_sa = NULL; + svm->ghcb_sa_free = false; + } + + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb); + + sev_es_sync_to_ghcb(svm); + + kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true); + svm->ghcb = NULL; +} + void pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu(svm_data, cpu); int asid = sev_get_asid(svm->vcpu.kvm); + /* Perform any SEV-ES pre-run actions */ + pre_sev_es_run(svm); + /* Assign the asid allocated with this SEV guest */ - svm->vmcb->control.asid = asid; + svm->asid = asid; /* * Flush guest TLB: @@ -1203,3 +1722,387 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } + +#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) +static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + struct ghcb *ghcb = svm->ghcb; + u64 ghcb_scratch_beg, ghcb_scratch_end; + u64 scratch_gpa_beg, scratch_gpa_end; + void *scratch_va; + + scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); + if (!scratch_gpa_beg) { + pr_err("vmgexit: scratch gpa not provided\n"); + return false; + } + + scratch_gpa_end = scratch_gpa_beg + len; + if (scratch_gpa_end < scratch_gpa_beg) { + pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", + len, scratch_gpa_beg); + return false; + } + + if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { + /* Scratch area begins within GHCB */ + ghcb_scratch_beg = control->ghcb_gpa + + offsetof(struct ghcb, shared_buffer); + ghcb_scratch_end = control->ghcb_gpa + + offsetof(struct ghcb, reserved_1); + + /* + * If the scratch area begins within the GHCB, it must be + * completely contained in the GHCB shared buffer area. + */ + if (scratch_gpa_beg < ghcb_scratch_beg || + scratch_gpa_end > ghcb_scratch_end) { + pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", + scratch_gpa_beg, scratch_gpa_end); + return false; + } + + scratch_va = (void *)svm->ghcb; + scratch_va += (scratch_gpa_beg - control->ghcb_gpa); + } else { + /* + * The guest memory must be read into a kernel buffer, so + * limit the size + */ + if (len > GHCB_SCRATCH_AREA_LIMIT) { + pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", + len, GHCB_SCRATCH_AREA_LIMIT); + return false; + } + scratch_va = kzalloc(len, GFP_KERNEL); + if (!scratch_va) + return false; + + if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { + /* Unable to copy scratch area from guest */ + pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); + + kfree(scratch_va); + return false; + } + + /* + * The scratch area is outside the GHCB. The operation will + * dictate whether the buffer needs to be synced before running + * the vCPU next time (i.e. a read was requested so the data + * must be written back to the guest memory). + */ + svm->ghcb_sa_sync = sync; + svm->ghcb_sa_free = true; + } + + svm->ghcb_sa = scratch_va; + svm->ghcb_sa_len = len; + + return true; +} + +static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, + unsigned int pos) +{ + svm->vmcb->control.ghcb_gpa &= ~(mask << pos); + svm->vmcb->control.ghcb_gpa |= (value & mask) << pos; +} + +static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos) +{ + return (svm->vmcb->control.ghcb_gpa >> pos) & mask; +} + +static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) +{ + svm->vmcb->control.ghcb_gpa = value; +} + +static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 ghcb_info; + int ret = 1; + + ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK; + + trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id, + control->ghcb_gpa); + + switch (ghcb_info) { + case GHCB_MSR_SEV_INFO_REQ: + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + GHCB_VERSION_MIN, + sev_enc_bit)); + break; + case GHCB_MSR_CPUID_REQ: { + u64 cpuid_fn, cpuid_reg, cpuid_value; + + cpuid_fn = get_ghcb_msr_bits(svm, + GHCB_MSR_CPUID_FUNC_MASK, + GHCB_MSR_CPUID_FUNC_POS); + + /* Initialize the registers needed by the CPUID intercept */ + vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn; + vcpu->arch.regs[VCPU_REGS_RCX] = 0; + + ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID); + if (!ret) { + ret = -EINVAL; + break; + } + + cpuid_reg = get_ghcb_msr_bits(svm, + GHCB_MSR_CPUID_REG_MASK, + GHCB_MSR_CPUID_REG_POS); + if (cpuid_reg == 0) + cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX]; + else if (cpuid_reg == 1) + cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX]; + else if (cpuid_reg == 2) + cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX]; + else + cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX]; + + set_ghcb_msr_bits(svm, cpuid_value, + GHCB_MSR_CPUID_VALUE_MASK, + GHCB_MSR_CPUID_VALUE_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } + case GHCB_MSR_TERM_REQ: { + u64 reason_set, reason_code; + + reason_set = get_ghcb_msr_bits(svm, + GHCB_MSR_TERM_REASON_SET_MASK, + GHCB_MSR_TERM_REASON_SET_POS); + reason_code = get_ghcb_msr_bits(svm, + GHCB_MSR_TERM_REASON_MASK, + GHCB_MSR_TERM_REASON_POS); + pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", + reason_set, reason_code); + fallthrough; + } + default: + ret = -EINVAL; + } + + trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, + control->ghcb_gpa, ret); + + return ret; +} + +int sev_handle_vmgexit(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + u64 ghcb_gpa, exit_code; + struct ghcb *ghcb; + int ret; + + /* Validate the GHCB */ + ghcb_gpa = control->ghcb_gpa; + if (ghcb_gpa & GHCB_MSR_INFO_MASK) + return sev_handle_vmgexit_msr_protocol(svm); + + if (!ghcb_gpa) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n"); + return -EINVAL; + } + + if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + /* Unable to map GHCB from guest */ + vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", + ghcb_gpa); + return -EINVAL; + } + + svm->ghcb = svm->ghcb_map.hva; + ghcb = svm->ghcb_map.hva; + + trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb); + + exit_code = ghcb_get_sw_exit_code(ghcb); + + ret = sev_es_validate_vmgexit(svm); + if (ret) + return ret; + + sev_es_sync_from_ghcb(svm); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + ret = -EINVAL; + switch (exit_code) { + case SVM_VMGEXIT_MMIO_READ: + if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) + break; + + ret = kvm_sev_es_mmio_read(&svm->vcpu, + control->exit_info_1, + control->exit_info_2, + svm->ghcb_sa); + break; + case SVM_VMGEXIT_MMIO_WRITE: + if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) + break; + + ret = kvm_sev_es_mmio_write(&svm->vcpu, + control->exit_info_1, + control->exit_info_2, + svm->ghcb_sa); + break; + case SVM_VMGEXIT_NMI_COMPLETE: + ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET); + break; + case SVM_VMGEXIT_AP_HLT_LOOP: + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + break; + case SVM_VMGEXIT_AP_JUMP_TABLE: { + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + + switch (control->exit_info_1) { + case 0: + /* Set AP jump table address */ + sev->ap_jump_table = control->exit_info_2; + break; + case 1: + /* Get AP jump table address */ + ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table); + break; + default: + pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", + control->exit_info_1); + ghcb_set_sw_exit_info_1(ghcb, 1); + ghcb_set_sw_exit_info_2(ghcb, + X86_TRAP_UD | + SVM_EVTINJ_TYPE_EXEPT | + SVM_EVTINJ_VALID); + } + + ret = 1; + break; + } + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + vcpu_unimpl(&svm->vcpu, + "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", + control->exit_info_1, control->exit_info_2); + break; + default: + ret = svm_invoke_exit_handler(svm, exit_code); + } + + return ret; +} + +int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) +{ + if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2)) + return -EINVAL; + + return kvm_sev_es_string_io(&svm->vcpu, size, port, + svm->ghcb_sa, svm->ghcb_sa_len, in); +} + +void sev_es_init_vmcb(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; + + /* + * An SEV-ES guest requires a VMSA area that is a separate from the + * VMCB page. Do not include the encryption mask on the VMSA physical + * address since hardware will access it using the guest key. + */ + svm->vmcb->control.vmsa_pa = __pa(svm->vmsa); + + /* Can't intercept CR register access, HV can't modify CR registers */ + svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR4_READ); + svm_clr_intercept(svm, INTERCEPT_CR8_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR4_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); + + svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0); + + /* Track EFER/CR register changes */ + svm_set_intercept(svm, TRAP_EFER_WRITE); + svm_set_intercept(svm, TRAP_CR0_WRITE); + svm_set_intercept(svm, TRAP_CR4_WRITE); + svm_set_intercept(svm, TRAP_CR8_WRITE); + + /* No support for enable_vmware_backdoor */ + clr_exception_intercept(svm, GP_VECTOR); + + /* Can't intercept XSETBV, HV can't modify XCR0 directly */ + svm_clr_intercept(svm, INTERCEPT_XSETBV); + + /* Clear intercepts on selected MSRs */ + set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); +} + +void sev_es_create_vcpu(struct vcpu_svm *svm) +{ + /* + * Set the GHCB MSR value as per the GHCB specification when creating + * a vCPU for an SEV-ES guest. + */ + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + GHCB_VERSION_MIN, + sev_enc_bit)); +} + +void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) +{ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct vmcb_save_area *hostsa; + + /* + * As an SEV-ES guest, hardware will restore the host state on VMEXIT, + * of which one step is to perform a VMLOAD. Since hardware does not + * perform a VMSAVE on VMRUN, the host savearea must be updated. + */ + vmsave(__sme_page_pa(sd->save_area)); + + /* XCR0 is restored on VMEXIT, save the current host value */ + hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); + hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + + /* PKRU is restored on VMEXIT, save the curent host value */ + hostsa->pkru = read_pkru(); + + /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ + hostsa->xss = host_xss; +} + +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* First SIPI: Use the values as initially set by the VMM */ + if (!svm->received_first_sipi) { + svm->received_first_sipi = true; + return; + } + + /* + * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where + * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a + * non-zero value. + */ + ghcb_set_sw_exit_info_2(svm->ghcb, 1); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index da7eb4aaf44f..58a45bb139f8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -33,14 +33,15 @@ #include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/irq_remapping.h> -#include <asm/mce.h> #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> +#include <asm/traps.h> #include <asm/virtext.h> #include "trace.h" #include "svm.h" +#include "svm_ops.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -90,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ - bool always; /* True if intercept is always on */ + bool always; /* True if intercept is initially cleared */ } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, @@ -108,16 +109,12 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, { .index = MSR_IA32_LASTINTTOIP, .always = false }, + { .index = MSR_EFER, .always = false }, + { .index = MSR_IA32_CR_PAT, .always = false }, + { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_INVALID, .always = false }, }; -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -bool npt_enabled = true; -#else -bool npt_enabled; -#endif - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -166,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444); static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; module_param(pause_filter_count_max, ushort, 0444); -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); +/* + * Use nested page tables by default. Note, NPT may get forced off by + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. + */ +bool npt_enabled = true; +module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; @@ -187,15 +187,19 @@ static int vgif = true; module_param(vgif, int, 0444); /* enable/disable SEV support */ -static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); +int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); module_param(sev, int, 0444); -static bool __read_mostly dump_invalid_vmcb = 0; +/* enable/disable SEV-ES support */ +int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); +module_param(sev_es, int, 0444); + +bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); -static u8 rsm_ins_bytes[] = "\x0f\xaa"; +static bool svm_gp_erratum_intercept = true; -static void svm_complete_interrupts(struct vcpu_svm *svm); +static u8 rsm_ins_bytes[] = "\x0f\xaa"; static unsigned long iopm_base; @@ -239,21 +243,6 @@ u32 svm_msrpm_offset(u32 msr) #define MAX_INST_SIZE 15 -static inline void clgi(void) -{ - asm volatile (__ex("clgi")); -} - -static inline void stgi(void) -{ - asm volatile (__ex("stgi")); -} - -static inline void invlpga(unsigned long addr, u32 asid) -{ - asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); -} - static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 @@ -281,6 +270,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!(efer & EFER_SVME)) { svm_leave_nested(svm); svm_set_gif(svm, true); + /* #GP intercept is still needed for vmware backdoor */ + if (!enable_vmware_backdoor) + clr_exception_intercept(svm, GP_VECTOR); /* * Free the nested guest state, unless we are in SMM. @@ -297,6 +289,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.efer = old_efer; return ret; } + + if (svm_gp_erratum_intercept) + set_exception_intercept(svm, GP_VECTOR); } } @@ -336,6 +331,13 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * SEV-ES does not expose the next RIP. The RIP update is controlled by + * the type of exit and the #VC handler in the guest. + */ + if (sev_es_guest(vcpu->kvm)) + goto done; + if (nrips && svm->vmcb->control.next_rip != 0) { WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; @@ -347,6 +349,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) } else { kvm_rip_write(vcpu, svm->next_rip); } + +done: svm_set_interrupt_shadow(vcpu, 0); return 1; @@ -438,6 +442,11 @@ static int has_svm(void) return 0; } + if (sev_active()) { + pr_info("KVM is unsupported when running as an SEV guest\n"); + return 0; + } + return 1; } @@ -484,7 +493,7 @@ static int svm_hardware_enable(void) wrmsrl(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); @@ -552,6 +561,7 @@ static int svm_cpu_init(int cpu) sd->save_area = alloc_page(GFP_KERNEL); if (!sd->save_area) goto free_cpu_data; + clear_page(page_address(sd->save_area)); if (svm_sev_enabled()) { sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1, @@ -662,8 +672,8 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, msrpm[offset] = tmp; } -static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, - int read, int write) +void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write) { set_shadow_msr_intercept(vcpu, msr, read, write); set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); @@ -903,15 +913,15 @@ static __init void svm_set_cpu_caps(void) if (npt_enabled) kvm_cpu_cap_set(X86_FEATURE_NPT); + + /* Nested VM can receive #VMEXIT instead of triggering #GP */ + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - - /* Enable INVPCID feature */ - kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); } static __init int svm_hardware_setup(void) @@ -959,15 +969,11 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } - if (sev) { - if (boot_cpu_has(X86_FEATURE_SEV) && - IS_ENABLED(CONFIG_KVM_AMD_SEV)) { - r = sev_hardware_setup(); - if (r) - sev = false; - } else { - sev = false; - } + if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) { + sev_hardware_setup(); + } else { + sev = false; + sev_es = false; } svm_adjust_mmio_mask(); @@ -978,10 +984,15 @@ static __init int svm_hardware_setup(void) goto err; } - if (!boot_cpu_has(X86_FEATURE_NPT)) + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) npt_enabled = false; - if (npt_enabled && !npt) + if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); @@ -1014,6 +1025,9 @@ static __init int svm_hardware_setup(void) } } + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) + svm_gp_erratum_intercept = false; + if (vgif) { if (!boot_cpu_has(X86_FEATURE_VGIF)) vgif = false; @@ -1087,12 +1101,12 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) static void svm_check_invpcid(struct vcpu_svm *svm) { /* - * Intercept INVPCID instruction only if shadow page table is - * enabled. Interception is not required with nested page table - * enabled. + * Intercept INVPCID if shadow paging is enabled to sync/free shadow + * roots, or if INVPCID is disabled in the guest to inject #UD. */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { - if (!npt_enabled) + if (!npt_enabled || + !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); @@ -1187,9 +1201,10 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); + svm_set_cr4(&svm->vcpu, 0); svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; - kvm_set_rflags(&svm->vcpu, 2); + kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1215,6 +1230,7 @@ static void init_vmcb(struct vcpu_svm *svm) save->cr4 = 0; } svm->asid_generation = 0; + svm->asid = 0; svm->nested.vmcb12_gpa = 0; svm->vcpu.arch.hflags = 0; @@ -1252,6 +1268,11 @@ static void init_vmcb(struct vcpu_svm *svm) if (sev_guest(svm->vcpu.kvm)) { svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); + + if (sev_es_guest(svm->vcpu.kvm)) { + /* Perform SEV-ES specific VMCB updates */ + sev_es_init_vmcb(svm); + } } vmcb_mark_all_dirty(svm->vmcb); @@ -1288,6 +1309,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb_page; + struct page *vmsa_page = NULL; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); @@ -1298,9 +1320,27 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (!vmcb_page) goto out; + if (sev_es_guest(svm->vcpu.kvm)) { + /* + * SEV-ES guests require a separate VMSA page used to contain + * the encrypted register state of the guest. + */ + vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmsa_page) + goto error_free_vmcb_page; + + /* + * SEV-ES guests maintain an encrypted version of their FPU + * state which is restored and saved on VMRUN and VMEXIT. + * Free the fpu structure to prevent KVM from attempting to + * access the FPU state. + */ + kvm_free_guest_fpu(vcpu); + } + err = avic_init_vcpu(svm); if (err) - goto error_free_vmcb_page; + goto error_free_vmsa_page; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -1311,21 +1351,33 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; - goto error_free_vmcb_page; + goto error_free_vmsa_page; } svm_vcpu_init_msrpm(vcpu, svm->msrpm); svm->vmcb = page_address(vmcb_page); svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); + + if (vmsa_page) + svm->vmsa = page_address(vmsa_page); + svm->asid_generation = 0; + svm->guest_state_loaded = false; init_vmcb(svm); svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; + if (sev_es_guest(svm->vcpu.kvm)) + /* Perform SEV-ES specific VMCB creation updates */ + sev_es_create_vcpu(svm); + return 0; +error_free_vmsa_page: + if (vmsa_page) + __free_page(vmsa_page); error_free_vmcb_page: __free_page(vmcb_page); out: @@ -1353,31 +1405,39 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) svm_free_nested(svm); + sev_free_vcpu(vcpu); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); } -static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - int i; - - if (unlikely(cpu != vcpu->cpu)) { - svm->asid_generation = 0; - vmcb_mark_all_dirty(svm->vmcb); - } + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + unsigned int i; -#ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); -#endif - savesegment(fs, svm->host.fs); - savesegment(gs, svm->host.gs); - svm->host.ldt = kvm_read_ldt(); + if (svm->guest_state_loaded) + return; + /* + * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save + * area (non-sev-es). Save ones that aren't so we can restore them + * individually later. + */ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + /* + * Save additional host state that will be restored on VMEXIT (sev-es) + * or subsequent vmload of host save area. + */ + if (sev_es_guest(svm->vcpu.kvm)) { + sev_es_prepare_guest_switch(svm, vcpu->cpu); + } else { + vmsave(__sme_page_pa(sd->save_area)); + } + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { @@ -1385,10 +1445,42 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); } } + /* This assumes that the kernel never uses MSR_TSC_AUX */ if (static_cpu_has(X86_FEATURE_RDTSCP)) wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + svm->guest_state_loaded = true; +} + +static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + unsigned int i; + + if (!svm->guest_state_loaded) + return; + + /* + * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save + * area (non-sev-es). Restore the ones that weren't. + */ + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + svm->guest_state_loaded = false; +} + +static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + + if (unlikely(cpu != vcpu->cpu)) { + svm->asid_generation = 0; + vmcb_mark_all_dirty(svm->vmcb); + } + if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); @@ -1398,24 +1490,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - int i; - avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; - kvm_load_ldt(svm->host.ldt); -#ifdef CONFIG_X86_64 - loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); - load_gs_index(svm->host.gs); -#else -#ifdef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -1633,9 +1711,18 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) static void update_cr0_intercept(struct vcpu_svm *svm) { - ulong gcr0 = svm->vcpu.arch.cr0; - u64 *hcr0 = &svm->vmcb->save.cr0; + ulong gcr0; + u64 *hcr0; + + /* + * SEV-ES guests must always keep the CR intercepts cleared. CR + * tracking is done using the CR write traps. + */ + if (sev_es_guest(svm->vcpu.kvm)) + return; + gcr0 = svm->vcpu.arch.cr0; + hcr0 = &svm->vmcb->save.cr0; *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | (gcr0 & SVM_CR0_SELECTIVE_MASK); @@ -1655,7 +1742,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; @@ -1684,13 +1771,15 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) update_cr0_intercept(svm); } -int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; - unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + return true; +} - if (cr4 & X86_CR4_VMXE) - return 1; +void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; + unsigned long old_cr4 = vcpu->arch.cr4; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) svm_flush_tlb(vcpu); @@ -1701,7 +1790,9 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); - return 0; + + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + kvm_update_cpuid_runtime(vcpu); } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1735,7 +1826,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, vmcb_mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_exception_bitmap(struct kvm_vcpu *vcpu) +static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1753,18 +1844,20 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) ++sd->asid_generation; sd->next_asid = sd->min_asid; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } svm->asid_generation = sd->asid_generation; - svm->vmcb->control.asid = sd->next_asid++; - - vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + svm->asid = sd->next_asid++; } static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) { struct vmcb *vmcb = svm->vmcb; + if (svm->vcpu.arch.guest_state_protected) + return; + if (unlikely(value != vmcb->save.dr6)) { vmcb->save.dr6 = value; vmcb_mark_dirty(vmcb, VMCB_DR); @@ -1775,12 +1868,15 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (vcpu->arch.guest_state_protected) + return; + get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); /* - * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here, + * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here, * because db_interception might need it. We can do it before vmentry. */ vcpu->arch.dr6 = svm->vmcb->save.dr6; @@ -1793,6 +1889,9 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); + if (vcpu->arch.guest_state_protected) + return; + svm->vmcb->save.dr7 = value; vmcb_mark_dirty(svm->vmcb, VMCB_DR); } @@ -1828,7 +1927,7 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { - u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1; + u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload); return 1; } @@ -1874,24 +1973,6 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } -static int gp_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - u32 error_code = svm->vmcb->control.exit_info_1; - - WARN_ON_ONCE(!enable_vmware_backdoor); - - /* - * VMware backdoor emulation on #GP interception only handles IN{S}, - * OUT{S}, and RDPMC, none of which generate a non-zero error code. - */ - if (error_code) { - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; - } - return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); -} - static bool is_erratum_383(void) { int err, i; @@ -1931,25 +2012,6 @@ static bool is_erratum_383(void) return true; } -/* - * Trigger machine check on the host. We assume all the MSRs are already set up - * by the CPU and that we still run on the same CPU as the MCE occurred on. - * We pass a fake environment to the machine check handler because we want - * the guest to be always treated like user space, no matter what context - * it used internally. - */ -static void kvm_machine_check(void) -{ -#if defined(CONFIG_X86_MCE) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; - - do_machine_check(®s); -#endif -} - static void svm_handle_mce(struct vcpu_svm *svm) { if (is_erratum_383()) { @@ -1981,6 +2043,13 @@ static int shutdown_interception(struct vcpu_svm *svm) struct kvm_run *kvm_run = svm->vcpu.run; /* + * The VM save area has already been encrypted so it + * cannot be reinitialized - just terminate. + */ + if (sev_es_guest(svm->vcpu.kvm)) + return -EINVAL; + + /* * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. */ @@ -2001,11 +2070,16 @@ static int io_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string) - return kvm_emulate_instruction(vcpu, 0); - port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; + + if (string) { + if (sev_es_guest(vcpu->kvm)) + return sev_es_string_io(svm, size, port, in); + else + return kvm_emulate_instruction(vcpu, 0); + } + svm->next_rip = svm->vmcb->control.exit_info_2; return kvm_fast_pio(&svm->vcpu, size, port, in); @@ -2097,6 +2171,107 @@ static int vmrun_interception(struct vcpu_svm *svm) return nested_svm_vmrun(svm); } +enum { + NONE_SVM_INSTR, + SVM_INSTR_VMRUN, + SVM_INSTR_VMLOAD, + SVM_INSTR_VMSAVE, +}; + +/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ +static int svm_instr_opcode(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + if (ctxt->b != 0x1 || ctxt->opcode_len != 2) + return NONE_SVM_INSTR; + + switch (ctxt->modrm) { + case 0xd8: /* VMRUN */ + return SVM_INSTR_VMRUN; + case 0xda: /* VMLOAD */ + return SVM_INSTR_VMLOAD; + case 0xdb: /* VMSAVE */ + return SVM_INSTR_VMSAVE; + default: + break; + } + + return NONE_SVM_INSTR; +} + +static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) +{ + const int guest_mode_exit_codes[] = { + [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, + [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, + [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, + }; + int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = { + [SVM_INSTR_VMRUN] = vmrun_interception, + [SVM_INSTR_VMLOAD] = vmload_interception, + [SVM_INSTR_VMSAVE] = vmsave_interception, + }; + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (is_guest_mode(vcpu)) { + svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode]; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + /* Returns '1' or -errno on failure, '0' on success. */ + ret = nested_svm_vmexit(svm); + if (ret) + return ret; + return 1; + } + return svm_instr_handlers[opcode](svm); +} + +/* + * #GP handling code. Note that #GP can be triggered under the following two + * cases: + * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on + * some AMD CPUs when EAX of these instructions are in the reserved memory + * regions (e.g. SMM memory on host). + * 2) VMware backdoor + */ +static int gp_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + u32 error_code = svm->vmcb->control.exit_info_1; + int opcode; + + /* Both #GP cases have zero error_code */ + if (error_code) + goto reinject; + + /* Decode the instruction for usage later */ + if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) + goto reinject; + + opcode = svm_instr_opcode(vcpu); + + if (opcode == NONE_SVM_INSTR) { + if (!enable_vmware_backdoor) + goto reinject; + + /* + * VMware backdoor emulation on #GP interception only handles + * IN{S}, OUT{S}, and RDPMC. + */ + if (!is_guest_mode(vcpu)) + return kvm_emulate_instruction(vcpu, + EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); + } else + return emulate_svm_instr(vcpu, opcode); + +reinject: + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; +} + void svm_set_gif(struct vcpu_svm *svm, bool value) { if (value) { @@ -2184,11 +2359,8 @@ static int xsetbv_interception(struct vcpu_svm *svm) u64 new_bv = kvm_read_edx_eax(&svm->vcpu); u32 index = kvm_rcx_read(&svm->vcpu); - if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { - return kvm_skip_emulated_instruction(&svm->vcpu); - } - - return 1; + int err = kvm_set_xcr(&svm->vcpu, index, new_bv); + return kvm_complete_insn_gp(&svm->vcpu, err); } static int rdpru_interception(struct vcpu_svm *svm) @@ -2269,9 +2441,11 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - svm_clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; - svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + if (!sev_es_guest(svm->vcpu.kvm)) { + svm_clr_intercept(svm, INTERCEPT_IRET); + svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + } kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -2408,10 +2582,46 @@ static int cr_interception(struct vcpu_svm *svm) return kvm_complete_insn_gp(&svm->vcpu, err); } +static int cr_trap(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + unsigned long old_value, new_value; + unsigned int cr; + int ret = 0; + + new_value = (unsigned long)svm->vmcb->control.exit_info_1; + + cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; + switch (cr) { + case 0: + old_value = kvm_read_cr0(vcpu); + svm_set_cr0(vcpu, new_value); + + kvm_post_set_cr0(vcpu, old_value, new_value); + break; + case 4: + old_value = kvm_read_cr4(vcpu); + svm_set_cr4(vcpu, new_value); + + kvm_post_set_cr4(vcpu, old_value, new_value); + break; + case 8: + ret = kvm_set_cr8(&svm->vcpu, new_value); + break; + default: + WARN(1, "unhandled CR%d write trap", cr); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + return kvm_complete_insn_gp(vcpu, ret); +} + static int dr_interception(struct vcpu_svm *svm) { int reg, dr; unsigned long val; + int err = 0; if (svm->vcpu.guest_debug == 0) { /* @@ -2429,20 +2639,16 @@ static int dr_interception(struct vcpu_svm *svm) reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; - - if (dr >= 16) { /* mov to DRn */ - if (!kvm_require_dr(&svm->vcpu, dr - 16)) - return 1; + if (dr >= 16) { /* mov to DRn */ + dr -= 16; val = kvm_register_read(&svm->vcpu, reg); - kvm_set_dr(&svm->vcpu, dr - 16, val); + err = kvm_set_dr(&svm->vcpu, dr, val); } else { - if (!kvm_require_dr(&svm->vcpu, dr)) - return 1; kvm_get_dr(&svm->vcpu, dr, &val); kvm_register_write(&svm->vcpu, reg, val); } - return kvm_skip_emulated_instruction(&svm->vcpu); + return kvm_complete_insn_gp(&svm->vcpu, err); } static int cr8_write_interception(struct vcpu_svm *svm) @@ -2461,6 +2667,25 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } +static int efer_trap(struct vcpu_svm *svm) +{ + struct msr_data msr_info; + int ret; + + /* + * Clear the EFER_SVME bit from EFER. The SVM code always sets this + * bit in svm_set_efer(), but __kvm_valid_efer() checks it against + * whether the guest has X86_FEATURE_SVM - this avoids a failure if + * the guest doesn't have X86_FEATURE_SVM. + */ + msr_info.host_initiated = false; + msr_info.index = MSR_EFER; + msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME; + ret = kvm_set_msr_common(&svm->vcpu, &msr_info); + + return kvm_complete_insn_gp(&svm->vcpu, ret); +} + static int svm_get_msr_feature(struct kvm_msr_entry *msr) { msr->data = 0; @@ -2543,10 +2768,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = svm->spec_ctrl; @@ -2584,6 +2806,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } +static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (!sev_es_guest(svm->vcpu.kvm) || !err) + return kvm_complete_insn_gp(&svm->vcpu, err); + + ghcb_set_sw_exit_info_1(svm->ghcb, 1); + ghcb_set_sw_exit_info_2(svm->ghcb, + X86_TRAP_GP | + SVM_EVTINJ_TYPE_EXEPT | + SVM_EVTINJ_VALID); + return 1; +} + static int rdmsr_interception(struct vcpu_svm *svm) { return kvm_emulate_rdmsr(&svm->vcpu); @@ -2630,10 +2866,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) @@ -2658,12 +2891,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; @@ -2805,7 +3038,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm) static int pause_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - bool in_kernel = (svm_get_cpl(vcpu) == 0); + bool in_kernel; + + /* + * CPL is not made available for an SEV-ES guest, therefore + * vcpu->arch.preempted_in_kernel can never be true. Just + * set in_kernel to false as well. + */ + in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0; if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); @@ -2920,11 +3160,16 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, + [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, + [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, + [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, + [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -2966,6 +3211,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); + pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); @@ -2973,6 +3219,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); + pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3045,6 +3292,43 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } +static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +{ + if (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]) + return 0; + + vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); + dump_vmcb(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + + return -EINVAL; +} + +int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code) +{ + if (svm_handle_invalid_exit(&svm->vcpu, exit_code)) + return 0; + +#ifdef CONFIG_RETPOLINE + if (exit_code == SVM_EXIT_MSR) + return msr_interception(svm); + else if (exit_code == SVM_EXIT_VINTR) + return interrupt_window_interception(svm); + else if (exit_code == SVM_EXIT_INTR) + return intr_interception(svm); + else if (exit_code == SVM_EXIT_HLT) + return halt_interception(svm); + else if (exit_code == SVM_EXIT_NPF) + return npf_interception(svm); +#endif + return svm_exit_handlers[exit_code](svm); +} + static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { @@ -3068,10 +3352,13 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) - vcpu->arch.cr0 = svm->vmcb->save.cr0; - if (npt_enabled) - vcpu->arch.cr3 = svm->vmcb->save.cr3; + /* SEV-ES guests must use the CR write traps to track CR registers. */ + if (!sev_es_guest(vcpu->kvm)) { + if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) + vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) + vcpu->arch.cr3 = svm->vmcb->save.cr3; + } if (is_guest_mode(vcpu)) { int vmexit; @@ -3108,32 +3395,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) - || !svm_exit_handlers[exit_code]) { - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); - dump_vmcb(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - return 0; - } - -#ifdef CONFIG_RETPOLINE - if (exit_code == SVM_EXIT_MSR) - return msr_interception(svm); - else if (exit_code == SVM_EXIT_VINTR) - return interrupt_window_interception(svm); - else if (exit_code == SVM_EXIT_INTR) - return intr_interception(svm); - else if (exit_code == SVM_EXIT_HLT) - return halt_interception(svm); - else if (exit_code == SVM_EXIT_NPF) - return npf_interception(svm); -#endif - return svm_exit_handlers[exit_code](svm); + return svm_invoke_exit_handler(svm, exit_code); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -3162,7 +3424,8 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - svm_set_intercept(svm, INTERCEPT_IRET); + if (!sev_es_guest(svm->vcpu.kvm)) + svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3179,10 +3442,17 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * SEV-ES guests must always keep the CR intercepts cleared. CR + * tracking is done using the CR write traps. + */ + if (sev_es_guest(vcpu->kvm)) + return; + if (nested_svm_virtualize_tpr(vcpu)) return; @@ -3239,10 +3509,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) if (masked) { svm->vcpu.arch.hflags |= HF_NMI_MASK; - svm_set_intercept(svm, INTERCEPT_IRET); + if (!sev_es_guest(svm->vcpu.kvm)) + svm_set_intercept(svm, INTERCEPT_IRET); } else { svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - svm_clr_intercept(svm, INTERCEPT_IRET); + if (!sev_es_guest(svm->vcpu.kvm)) + svm_clr_intercept(svm, INTERCEPT_IRET); } } @@ -3254,7 +3526,14 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (is_guest_mode(vcpu)) { + if (sev_es_guest(svm->vcpu.kvm)) { + /* + * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask + * bit to determine the state of the IF flag. + */ + if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) + return true; + } else if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) @@ -3288,7 +3567,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !svm_interrupt_blocked(vcpu); } -static void enable_irq_window(struct kvm_vcpu *vcpu) +static void svm_enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3312,7 +3591,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) } } -static void enable_nmi_window(struct kvm_vcpu *vcpu) +static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3369,10 +3648,6 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) invlpga(gva, svm->vmcb->control.asid); } -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) -{ -} - static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3413,8 +3688,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) * If we've made progress since setting HF_IRET_MASK, we've * executed an IRET and can allow NMI injection. */ - if ((svm->vcpu.arch.hflags & HF_IRET_MASK) - && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { + if ((svm->vcpu.arch.hflags & HF_IRET_MASK) && + (sev_es_guest(svm->vcpu.kvm) || + kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } @@ -3437,6 +3713,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) break; case SVM_EXITINTINFO_TYPE_EXEPT: /* + * Never re-inject a #VC exception. + */ + if (vector == X86_TRAP_VC) + break; + + /* * In case of software exceptions, do not reinject the vector, * but re-execute the instruction instead. Rewind RIP first * if we emulated INT3 before. @@ -3484,8 +3766,6 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); - static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) { @@ -3509,16 +3789,15 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); + if (sev_es_guest(svm->vcpu.kvm)) { + __svm_sev_es_vcpu_run(svm->vmcb_pa); + } else { + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); -#ifdef CONFIG_X86_64 - native_wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); + + vmload(__sme_page_pa(sd->save_area)); + } /* * VMEXIT disables interrupts (host state), but tracing and lockdep @@ -3544,6 +3823,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + trace_kvm_entry(vcpu); + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; @@ -3568,6 +3849,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); + if (unlikely(svm->asid != svm->vmcb->control.asid)) { + svm->vmcb->control.asid = svm->asid; + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + } svm->vmcb->save.cr2 = vcpu->arch.cr2; /* @@ -3577,7 +3862,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) svm_set_dr6(svm, vcpu->arch.dr6); else - svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM); + svm_set_dr6(svm, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); @@ -3612,14 +3897,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - reload_tss(vcpu); + if (!sev_es_guest(svm->vcpu.kvm)) + reload_tss(vcpu); x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); - vcpu->arch.cr2 = svm->vmcb->save.cr2; - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + if (!sev_es_guest(svm->vcpu.kvm)) { + vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + } if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); @@ -3722,12 +4010,21 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_emulated_msr(u32 index) +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return false; + case MSR_IA32_SMBASE: + /* SEV-ES guests do not support SMM, so report false */ + if (kvm && sev_es_guest(kvm)) + return false; + break; default: break; } @@ -3760,7 +4057,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) { best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); if (best) - vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f)); + vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } if (!kvm_vcpu_apicv_active(vcpu)) @@ -4067,7 +4364,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return ret; } -static void enable_smi_window(struct kvm_vcpu *vcpu) +static void svm_enable_smi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4086,6 +4383,12 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i unsigned long cr4; /* + * When the guest is an SEV-ES guest, emulation is not possible. + */ + if (sev_es_guest(vcpu->kvm)) + return false; + + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * * Errata: @@ -4165,6 +4468,14 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); } +static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) +{ + if (!sev_es_guest(vcpu->kvm)) + return kvm_vcpu_deliver_sipi_vector(vcpu, vector); + + sev_vcpu_deliver_sipi_vector(vcpu, vector); +} + static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); @@ -4207,7 +4518,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_blocking = svm_vcpu_blocking, .vcpu_unblocking = svm_vcpu_unblocking, - .update_exception_bitmap = update_exception_bitmap, + .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, .get_msr = svm_get_msr, .set_msr = svm_set_msr, @@ -4217,6 +4528,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, + .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, .get_idt = svm_get_idt, @@ -4249,9 +4561,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, .set_nmi_mask = svm_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, + .enable_nmi_window = svm_enable_nmi_window, + .enable_irq_window = svm_enable_irq_window, + .update_cr8_intercept = svm_update_cr8_intercept, .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, @@ -4294,7 +4606,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .smi_allowed = svm_smi_allowed, .pre_enter_smm = svm_pre_enter_smm, .pre_leave_smm = svm_pre_leave_smm, - .enable_smi_window = enable_smi_window, + .enable_smi_window = svm_enable_smi_window, .mem_enc_op = svm_mem_enc_op, .mem_enc_reg_region = svm_register_enc_region, @@ -4305,6 +4617,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, .msr_filter_changed = svm_msr_filter_changed, + .complete_emulated_msr = svm_complete_emulated_msr, + + .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1d853fe4c778..39e071fdab0c 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -17,21 +17,18 @@ #include <linux/kvm_types.h> #include <linux/kvm_host.h> +#include <linux/bits.h> #include <asm/svm.h> +#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) + static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_TSC_AUX, }; - #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) -#define MAX_DIRECT_ACCESS_MSRS 15 +#define MAX_DIRECT_ACCESS_MSRS 18 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -61,11 +58,13 @@ enum { struct kvm_sev_info { bool active; /* SEV enabled guest */ + bool es_active; /* SEV-ES enabled guest */ unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ + u64 ap_jump_table; /* SEV-ES AP Jump Table address */ }; struct kvm_svm { @@ -106,6 +105,7 @@ struct vcpu_svm { struct vmcb *vmcb; unsigned long vmcb_pa; struct svm_cpu_data *svm_data; + u32 asid; uint64_t asid_generation; uint64_t sysenter_esp; uint64_t sysenter_eip; @@ -116,12 +116,6 @@ struct vcpu_svm { u64 next_rip; u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; - u16 gs; - u16 ldt; - u64 gs_base; - } host; u64 spec_ctrl; /* @@ -166,6 +160,20 @@ struct vcpu_svm { DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); } shadow_msr_intercept; + + /* SEV-ES support */ + struct vmcb_save_area *vmsa; + struct ghcb *ghcb; + struct kvm_host_map ghcb_map; + bool received_first_sipi; + + /* SEV-ES scratch area support */ + void *ghcb_sa; + u64 ghcb_sa_len; + bool ghcb_sa_sync; + bool ghcb_sa_free; + + bool guest_state_loaded; }; struct svm_cpu_data { @@ -193,6 +201,28 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } +static inline bool sev_guest(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_AMD_SEV + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return sev->active; +#else + return false; +#endif +} + +static inline bool sev_es_guest(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_AMD_SEV + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return sev_guest(kvm) && sev->es_active; +#else + return false; +#endif +} + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -244,21 +274,24 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + if (!sev_es_guest(svm->vcpu.kvm)) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + } + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); @@ -270,6 +303,12 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) vmcb->control.intercepts[INTERCEPT_DR] = 0; + /* DR7 access must remain intercepted for an SEV-ES guest */ + if (sev_es_guest(svm->vcpu.kvm)) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + } + recalc_intercepts(svm); } @@ -346,11 +385,12 @@ static inline bool gif_set(struct vcpu_svm *svm) } /* svm.c */ -#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U -#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U -#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U #define MSR_INVALID 0xffffffffU +extern int sev; +extern int sev_es; +extern bool dump_invalid_vmcb; + u32 svm_msrpm_offset(u32 msr); u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); @@ -358,13 +398,16 @@ void svm_vcpu_free_msrpm(u32 *msrpm); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); bool svm_interrupt_blocked(struct kvm_vcpu *vcpu); void svm_set_gif(struct vcpu_svm *svm, bool value); +int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code); +void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write); /* nested.c */ @@ -470,18 +513,42 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ -extern unsigned int max_sev_asid; +#define GHCB_VERSION_MAX 1ULL +#define GHCB_VERSION_MIN 1ULL + +#define GHCB_MSR_INFO_POS 0 +#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) + +#define GHCB_MSR_SEV_INFO_RESP 0x001 +#define GHCB_MSR_SEV_INFO_REQ 0x002 +#define GHCB_MSR_VER_MAX_POS 48 +#define GHCB_MSR_VER_MAX_MASK 0xffff +#define GHCB_MSR_VER_MIN_POS 32 +#define GHCB_MSR_VER_MIN_MASK 0xffff +#define GHCB_MSR_CBIT_POS 24 +#define GHCB_MSR_CBIT_MASK 0xff +#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ + ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ + (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ + (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ + GHCB_MSR_SEV_INFO_RESP) + +#define GHCB_MSR_CPUID_REQ 0x004 +#define GHCB_MSR_CPUID_RESP 0x005 +#define GHCB_MSR_CPUID_FUNC_POS 32 +#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff +#define GHCB_MSR_CPUID_VALUE_POS 32 +#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff +#define GHCB_MSR_CPUID_REG_POS 30 +#define GHCB_MSR_CPUID_REG_MASK 0x3 + +#define GHCB_MSR_TERM_REQ 0x100 +#define GHCB_MSR_TERM_REASON_SET_POS 12 +#define GHCB_MSR_TERM_REASON_SET_MASK 0xf +#define GHCB_MSR_TERM_REASON_POS 16 +#define GHCB_MSR_TERM_REASON_MASK 0xff -static inline bool sev_guest(struct kvm *kvm) -{ -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; -#else - return false; -#endif -} +extern unsigned int max_sev_asid; static inline bool svm_sev_enabled(void) { @@ -495,7 +562,19 @@ int svm_register_enc_region(struct kvm *kvm, int svm_unregister_enc_region(struct kvm *kvm, struct kvm_enc_region *range); void pre_sev_run(struct vcpu_svm *svm, int cpu); -int __init sev_hardware_setup(void); +void __init sev_hardware_setup(void); void sev_hardware_teardown(void); +void sev_free_vcpu(struct kvm_vcpu *vcpu); +int sev_handle_vmgexit(struct vcpu_svm *svm); +int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); +void sev_es_init_vmcb(struct vcpu_svm *svm); +void sev_es_create_vcpu(struct vcpu_svm *svm); +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); +void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); + +/* vmenter.S */ + +void __svm_sev_es_vcpu_run(unsigned long vmcb_pa); +void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); #endif diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h new file mode 100644 index 000000000000..8170f2a5a16f --- /dev/null +++ b/arch/x86/kvm/svm/svm_ops.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SVM_OPS_H +#define __KVM_X86_SVM_OPS_H + +#include <linux/compiler_types.h> + +#include <asm/kvm_host.h> + +#define svm_asm(insn, clobber...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) "\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + ::: clobber : fault); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define svm_asm1(insn, op1, clobber...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + :: op1 : clobber : fault); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define svm_asm2(insn, op1, op2, clobber...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + :: op1, op2 : clobber : fault); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +static inline void clgi(void) +{ + svm_asm(clgi); +} + +static inline void stgi(void) +{ + svm_asm(stgi); +} + +static inline void invlpga(unsigned long addr, u32 asid) +{ + svm_asm2(invlpga, "c"(asid), "a"(addr)); +} + +/* + * Despite being a physical address, the portion of rAX that is consumed by + * VMSAVE, VMLOAD, etc... is still controlled by the effective address size, + * hence 'unsigned long' instead of 'hpa_t'. + */ +static inline void vmsave(unsigned long pa) +{ + svm_asm1(vmsave, "a" (pa), "memory"); +} + +static inline void vmload(unsigned long pa) +{ + svm_asm1(vmload, "a" (pa), "memory"); +} + +#endif /* __KVM_X86_SVM_OPS_H */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 1ec1ac40e328..6feb8c08f45a 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -168,3 +168,53 @@ SYM_FUNC_START(__svm_vcpu_run) pop %_ASM_BP ret SYM_FUNC_END(__svm_vcpu_run) + +/** + * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode + * @vmcb_pa: unsigned long + */ +SYM_FUNC_START(__svm_sev_es_vcpu_run) + push %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX + + /* Enter guest mode */ + mov %_ASM_ARG1, %_ASM_AX + sti + +1: vmrun %_ASM_AX + jmp 3f +2: cmpb $0, kvm_rebooting + jne 3f + ud2 + _ASM_EXTABLE(1b, 2b) + +3: cli + +#ifdef CONFIG_RETPOLINE + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE +#endif + + pop %_ASM_BX + +#ifdef CONFIG_X86_64 + pop %r12 + pop %r13 + pop %r14 + pop %r15 +#else + pop %esi + pop %edi +#endif + pop %_ASM_BP + ret +SYM_FUNC_END(__svm_sev_es_vcpu_run) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index aef960f90f26..a61c015870e3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -93,6 +93,42 @@ TRACE_EVENT(kvm_hv_hypercall, ); /* + * Tracepoint for Xen hypercall. + */ +TRACE_EVENT(kvm_xen_hypercall, + TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, unsigned long a4, + unsigned long a5), + TP_ARGS(nr, a0, a1, a2, a3, a4, a5), + + TP_STRUCT__entry( + __field(unsigned long, nr) + __field(unsigned long, a0) + __field(unsigned long, a1) + __field(unsigned long, a2) + __field(unsigned long, a3) + __field(unsigned long, a4) + __field(unsigned long, a5) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->a0 = a0; + __entry->a1 = a1; + __entry->a2 = a2; + __entry->a3 = a3; + __entry->a4 = a4; + __entry->a4 = a5; + ), + + TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", + __entry->nr, __entry->a0, __entry->a1, __entry->a2, + __entry->a3, __entry->a4, __entry->a5) +); + + + +/* * Tracepoint for PIO. */ @@ -256,7 +292,7 @@ TRACE_EVENT(name, \ __entry->guest_rip = kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ - kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \ + static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \ &__entry->info2, \ &__entry->intr_info, \ &__entry->error_code); \ @@ -738,7 +774,7 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->csbase = kvm_x86_ops.get_segment_base(vcpu, VCPU_SREG_CS); + __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS); __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr - vcpu->arch.emulate_ctxt->fetch.data; __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; @@ -1578,6 +1614,103 @@ TRACE_EVENT(kvm_hv_syndbg_get_msr, __entry->vcpu_id, __entry->vp_index, __entry->msr, __entry->data) ); + +/* + * Tracepoint for the start of VMGEXIT processing + */ +TRACE_EVENT(kvm_vmgexit_enter, + TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb), + TP_ARGS(vcpu_id, ghcb), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, exit_reason) + __field(u64, info1) + __field(u64, info2) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->exit_reason = ghcb->save.sw_exit_code; + __entry->info1 = ghcb->save.sw_exit_info_1; + __entry->info2 = ghcb->save.sw_exit_info_2; + ), + + TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx", + __entry->vcpu_id, __entry->exit_reason, + __entry->info1, __entry->info2) +); + +/* + * Tracepoint for the end of VMGEXIT processing + */ +TRACE_EVENT(kvm_vmgexit_exit, + TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb), + TP_ARGS(vcpu_id, ghcb), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, exit_reason) + __field(u64, info1) + __field(u64, info2) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->exit_reason = ghcb->save.sw_exit_code; + __entry->info1 = ghcb->save.sw_exit_info_1; + __entry->info2 = ghcb->save.sw_exit_info_2; + ), + + TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx", + __entry->vcpu_id, __entry->exit_reason, + __entry->info1, __entry->info2) +); + +/* + * Tracepoint for the start of VMGEXIT MSR procotol processing + */ +TRACE_EVENT(kvm_vmgexit_msr_protocol_enter, + TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa), + TP_ARGS(vcpu_id, ghcb_gpa), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, ghcb_gpa) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->ghcb_gpa = ghcb_gpa; + ), + + TP_printk("vcpu %u, ghcb_gpa %016llx", + __entry->vcpu_id, __entry->ghcb_gpa) +); + +/* + * Tracepoint for the end of VMGEXIT MSR procotol processing + */ +TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, + TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa, int result), + TP_ARGS(vcpu_id, ghcb_gpa, result), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, ghcb_gpa) + __field(int, result) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->ghcb_gpa = ghcb_gpa; + __entry->result = result; + ), + + TP_printk("vcpu %u, ghcb_gpa %016llx, result %d", + __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 3a1861403d73..d1d77985e889 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -19,6 +19,9 @@ extern int __read_mostly pt_mode; #define PT_MODE_HOST_GUEST 1 #define PMU_CAP_FW_WRITES (1ULL << 13) +#define PMU_CAP_LBR_FMT 0x3f + +#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) struct nested_vmx_msrs { /* @@ -262,6 +265,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void) SECONDARY_EXEC_TSC_SCALING; } +static inline bool cpu_has_vmx_bus_lock_detection(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_BUS_LOCK_DETECTION; +} + static inline bool cpu_has_vmx_apicv(void) { return cpu_has_vmx_apic_register_virt() && @@ -371,11 +380,28 @@ static inline bool vmx_pt_mode_is_host_guest(void) static inline u64 vmx_get_perf_capabilities(void) { + u64 perf_cap = 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + + perf_cap &= PMU_CAP_LBR_FMT; + /* * Since counters are virtualized, KVM would support full * width counting unconditionally, even if the host lacks it. */ - return PMU_CAP_FW_WRITES; + return PMU_CAP_FW_WRITES | perf_cap; +} + +static inline u64 vmx_supported_debugctl(void) +{ + u64 debugctl = 0; + + if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) + debugctl |= DEBUGCTLMSR_LBR_MASK; + + return debugctl; } #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index f3199bb02f22..41f24661af04 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -326,7 +326,6 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); /* * vmcs_version represents the range of supported Enlightened VMCS * versions: lower 8 bits is the minimal version, higher 8 bits is the @@ -334,7 +333,7 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) * KVM_EVMCS_VERSION. */ if (kvm_cpu_cap_get(X86_FEATURE_VMX) && - vmx->nested.enlightened_vmcs_enabled) + (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled)) return (KVM_EVMCS_VERSION << 8) | 1; return 0; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 89af692deb7e..bcca0b80e0d0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -12,6 +12,7 @@ #include "nested.h" #include "pmu.h" #include "trace.h" +#include "vmx.h" #include "x86.h" static bool __read_mostly enable_shadow_vmcs = 1; @@ -411,8 +412,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit if (nr == DB_VECTOR) { if (!has_payload) { payload = vcpu->arch.dr6; - payload &= ~(DR6_FIXED_1 | DR6_BT); - payload ^= DR6_RTM; + payload &= ~DR6_BT; + payload ^= DR6_ACTIVE_LOW; } *exit_qual = payload; } else @@ -744,8 +745,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, (CC(!nested_cpu_has_vid(vmcs12)) || CC(!nested_exit_intr_ack_set(vcpu)) || CC((vmcs12->posted_intr_nv & 0xff00)) || - CC((vmcs12->posted_intr_desc_addr & 0x3f)) || - CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) + CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -758,13 +758,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { - int maxphyaddr; - if (count == 0) return 0; - maxphyaddr = cpuid_maxphyaddr(vcpu); - if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || - (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) + + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || + !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; return 0; @@ -1062,14 +1060,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, } } -static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long invalid_mask; - - invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - return (val & invalid_mask) == 0; -} - /* * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't @@ -1121,7 +1111,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, enum vm_entry_failure_code *entry_failure_code) { - if (CC(!nested_cr3_valid(vcpu, cr3))) { + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -2177,15 +2167,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); /* - * The PML address never changes, so it is constant in vmcs02. - * Conceptually we want to copy the PML index from vmcs01 here, - * and then back to vmcs01 on nested vmexit. But since we flush - * the log and reset GUEST_PML_INDEX on each vmexit, the PML - * index is also effectively constant in vmcs02. + * PML is emulated for L2, but never enabled in hardware as the MMU + * handles A/D emulation. Disabling PML for L2 also avoids having to + * deal with filtering out L2 GPAs from the buffer. */ if (enable_pml) { - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write64(PML_ADDRESS, 0); + vmcs_write16(GUEST_PML_INDEX, -1); } if (cpu_has_vmx_encls_vmexit()) @@ -2220,7 +2208,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - u32 exec_control, vmcs12_exec_ctrl; + u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) @@ -2294,11 +2282,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { - vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & - ~SECONDARY_EXEC_ENABLE_PML; - exec_control |= vmcs12_exec_ctrl; - } + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + exec_control |= vmcs12->secondary_vm_exec_control; + + /* PML is emulated and never enabled in hardware for L2. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; @@ -2532,7 +2520,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. */ - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); @@ -2635,7 +2623,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); /* Check for memory type validity */ switch (new_eptp & VMX_EPTP_MT_MASK) { @@ -2666,7 +2653,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) } /* Reserved bits should not be set */ - if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ @@ -2850,7 +2837,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || - CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) + CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || @@ -2952,7 +2939,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) { if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && + vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) return -EINVAL; return 0; @@ -3056,35 +3044,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - asm( - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" - "je 1f \n\t" - __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" - "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" - "1: \n\t" - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ - - /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" - - /* - * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set - * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail - * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the - * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. - */ - "call vmx_vmenter\n\t" - - CC_SET(be) - : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : [HOST_RSP]"r"((unsigned long)HOST_RSP), - [loaded_vmcs]"r"(vmx->loaded_vmcs), - [launched]"i"(offsetof(struct loaded_vmcs, launched)), - [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), - [wordsize]"i"(sizeof(ulong)) - : "memory" - ); + vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); if (vmx->msr_autoload.host.nr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); @@ -3123,13 +3084,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_host_map *map; - struct page *page; - u64 hpa; /* * hv_evmcs may end up being not mapped after migration (when @@ -3152,6 +3109,17 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } } + return true; +} + +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_host_map *map; + struct page *page; + u64 hpa; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3220,6 +3188,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + + return true; +} + +static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + if (!nested_get_evmcs_page(vcpu)) + return false; + + if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) + return false; + return true; } @@ -3310,7 +3290,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; bool evaluate_pending_interrupts; - u32 exit_reason, failed_index; + union vmx_exit_reason exit_reason = { + .basic = EXIT_REASON_INVALID_STATE, + .failed_vmentry = 1, + }; + u32 failed_index; if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); @@ -3362,7 +3346,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (nested_vmx_check_guest_state(vcpu, vmcs12, &entry_failure_code)) { - exit_reason = EXIT_REASON_INVALID_STATE; + exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit; } @@ -3373,7 +3357,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { - exit_reason = EXIT_REASON_INVALID_STATE; + exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; } @@ -3383,7 +3367,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); if (failed_index) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode; } @@ -3451,7 +3435,7 @@ vmentry_fail_vmexit: return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->vm_exit_reason = exit_reason.full; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; @@ -3559,19 +3543,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ nested_cache_shadow_vmcs12(vcpu, vmcs12); - /* - * If we're entering a halted L2 vcpu and the L2 vcpu won't be - * awakened by event injection or by an NMI-window VM-exit or - * by an interrupt-window VM-exit, halt the vcpu. - */ - if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && - !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && - !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && - !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && - (vmcs12->guest_rflags & X86_EFLAGS_IF))) { + switch (vmcs12->guest_activity_state) { + case GUEST_ACTIVITY_HLT: + /* + * If we're entering a halted L2 vcpu and the L2 vcpu won't be + * awakened by event injection or by an NMI-window VM-exit or + * by an interrupt-window VM-exit, halt the vcpu. + */ + if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && + !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && + !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && + (vmcs12->guest_rflags & X86_EFLAGS_IF))) { + vmx->nested.nested_run_pending = 0; + return kvm_vcpu_halt(vcpu); + } + break; + case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - return kvm_vcpu_halt(vcpu); + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + break; + default: + break; } + return 1; vmentry_failed: @@ -3797,7 +3791,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; nested_vmx_update_pending_dbg(vcpu); clear_bit(KVM_APIC_INIT, &apic->pending_events); - nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) + nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + return 0; + } + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_SIPI, &apic->pending_events)) { + if (block_nested_events) + return -EBUSY; + + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, + apic->sipi_vector & 0xFFUL); return 0; } @@ -4036,6 +4043,8 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; @@ -4189,9 +4198,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); @@ -4416,6 +4422,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + /* Service the TLB flush request for L2 before switching to L1. */ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); @@ -4482,6 +4490,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_set_virtual_apic_mode(vcpu); } + if (vmx->nested.update_vmcs01_cpu_dirty_logging) { + vmx->nested.update_vmcs01_cpu_dirty_logging = false; + vmx_update_cpu_dirty_logging(vcpu); + } + /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_clean(vmx->nested.apic_access_page); @@ -4814,7 +4827,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* * The Intel VMX Instruction Reference lists a bunch of bits that are * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). * Otherwise, we should fail with #UD. But most faulting conditions * have already been checked by hardware, prior to the VM-exit for * VMXON. We do test guest cr4.VMXE because processor CR4 always has @@ -5512,7 +5525,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); fail: - nested_vmx_vmexit(vcpu, vmx->exit_reason, + /* + * This is effectively a reflected VM-Exit, as opposed to a synthesized + * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode + * EXIT_REASON_VMFUNC as the exit reason. + */ + nested_vmx_vmexit(vcpu, vmx->exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -5580,7 +5598,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. */ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 exit_reason) + struct vmcs12 *vmcs12, + union vmx_exit_reason exit_reason) { u32 msr_index = kvm_rcx_read(vcpu); gpa_t bitmap; @@ -5594,7 +5613,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -5731,11 +5750,12 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) * Return true if L0 wants to handle an exit from L2 regardless of whether or not * L1 wants the exit. Only call this when in is_guest_mode (L2). */ -static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) { u32 intr_info; - switch ((u16)exit_reason) { + switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) @@ -5773,7 +5793,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_PREEMPTION_TIMER: return true; case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ + /* + * PML is emulated for an L1 VMM and should never be enabled in + * vmcs02, always "handle" PML_FULL by exiting to userspace. + */ return true; case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ @@ -5791,12 +5814,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) * Return 1 if L1 wants to intercept an exit from L2. Only call this when in * is_guest_mode (L2). */ -static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 intr_info; - switch ((u16)exit_reason) { + switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) @@ -5915,7 +5939,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->exit_reason; unsigned long exit_qual; u32 exit_intr_info; @@ -5934,7 +5958,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX); + trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -5960,7 +5984,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); + nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); return true; } @@ -6049,11 +6073,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (vmx->nested.hv_evmcs) - copy_enlightened_to_vmcs12(vmx); - else if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); + } else { + copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); + if (!vmx->nested.need_vmcs12_to_shadow_sync) { + if (vmx->nested.hv_evmcs) + copy_enlightened_to_vmcs12(vmx); + else if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + } } BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); @@ -6483,7 +6510,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->misc_low |= MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | - VMX_MISC_ACTIVITY_HLT; + VMX_MISC_ACTIVITY_HLT | + VMX_MISC_ACTIVITY_WAIT_SIPI; msrs->misc_high = 0; /* @@ -6573,7 +6601,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, - .get_nested_state_pages = nested_get_vmcs12_pages, + .get_nested_state_pages = vmx_get_nested_state_pages, .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a886a47daebd..9efc1a6b8693 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -29,7 +29,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and intel_arch_events array */ @@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } -static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) { if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return false; + return 0; - return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES; + return vcpu->arch.perf_capabilities; +} + +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; } static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) @@ -168,6 +173,41 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } +bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) +{ + /* + * As a first step, a guest could only enable LBR feature if its + * cpu model is the same as the host because the LBR registers + * would be pass-through to the guest and they're model specific. + */ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); + + return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT); +} + +static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) +{ + struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); + bool ret = false; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return ret; + + ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) || + (index >= records->from && index < records->from + records->nr) || + (index >= records->to && index < records->to + records->nr); + + if (!ret && records->info) + ret = (index >= records->info && index < records->info + records->nr); + + return ret; +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -183,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr); + get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || + intel_pmu_is_valid_lbr_msr(vcpu, msr); break; } @@ -202,6 +243,111 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } +static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->event) { + perf_event_release_kernel(lbr_desc->event); + lbr_desc->event = NULL; + vcpu_to_pmu(vcpu)->event_count--; + } +} + +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct perf_event *event; + + /* + * The perf_event_attr is constructed in the minimum efficient way: + * - set 'pinned = true' to make it task pinned so that if another + * cpu pinned event reclaims LBR, the event->oncpu will be set to -1; + * - set '.exclude_host = true' to record guest branches behavior; + * + * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicates host perf + * schedule the event without a real HW counter but a fake one; + * check is_guest_lbr_event() and __intel_get_event_constraints(); + * + * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and + * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + * PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack + * event, which helps KVM to save/restore guest LBR records + * during host context switches and reduces quite a lot overhead, + * check branch_user_callstack() and intel_pmu_lbr_sched_task(); + */ + struct perf_event_attr attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(attr), + .config = INTEL_FIXED_VLBR_EVENT, + .sample_type = PERF_SAMPLE_BRANCH_STACK, + .pinned = true, + .exclude_host = true, + .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_USER, + }; + + if (unlikely(lbr_desc->event)) { + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + return 0; + } + + event = perf_event_create_kernel_counter(&attr, -1, + current, NULL, NULL); + if (IS_ERR(event)) { + pr_debug_ratelimited("%s: failed %ld\n", + __func__, PTR_ERR(event)); + return PTR_ERR(event); + } + lbr_desc->event = event; + pmu->event_count++; + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + return 0; +} + +/* + * It's safe to access LBR msrs from guest when they have not + * been passthrough since the host would help restore or reset + * the LBR msrs records when the guest LBR event is scheduled in. + */ +static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, bool read) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + u32 index = msr_info->index; + + if (!intel_pmu_is_valid_lbr_msr(vcpu, index)) + return false; + + if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0) + goto dummy; + + /* + * Disable irq to ensure the LBR feature doesn't get reclaimed by the + * host at the time the value is read from the msr, and this avoids the + * host LBR value to be leaked to the guest. If LBR has been reclaimed, + * return 0 on guest reads. + */ + local_irq_disable(); + if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { + if (read) + rdmsrl(index, msr_info->data); + else + wrmsrl(index, msr_info->data); + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); + local_irq_enable(); + return true; + } + clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); + local_irq_enable(); + +dummy: + if (read) + msr_info->data = 0; + return true; +} + static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -236,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { msr_info->data = pmc->eventsel; return 0; - } + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) + return 0; } return 1; @@ -307,7 +454,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reprogram_gp_counter(pmc, data); return 0; } - } + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) + return 0; } return 1; @@ -316,6 +464,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; @@ -327,7 +477,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; - vcpu->arch.perf_capabilities = 0; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -340,12 +489,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return; perf_get_x86_pmu_capability(&x86_pmu); - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, x86_pmu.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -355,6 +504,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, x86_pmu.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, + edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } @@ -381,12 +532,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); nested_vmx_pmu_entry_exit_ctls_update(vcpu); + + if (intel_pmu_lbr_is_compatible(vcpu)) + x86_perf_get_lbr(&lbr_desc->records); + else + lbr_desc->records.nr = 0; + + if (lbr_desc->records.nr) + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); } static void intel_pmu_init(struct kvm_vcpu *vcpu) { int i; struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; @@ -401,6 +561,11 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; pmu->fixed_counters[i].current_config = 0; } + + vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); + lbr_desc->records.nr = 0; + lbr_desc->event = NULL; + lbr_desc->msr_passthrough = false; } static void intel_pmu_reset(struct kvm_vcpu *vcpu) @@ -425,6 +590,119 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = pmu->global_ovf_ctrl = 0; + + intel_pmu_release_guest_lbr_event(vcpu); +} + +/* + * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4. + * + * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and + * the KVM emulates to clear the LBR bit (bit 0) in IA32_DEBUGCTL. + * + * Guest needs to re-enable LBR to resume branches recording. + */ +static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) +{ + u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); + + if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { + data &= ~DEBUGCTLMSR_LBR; + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + } +} + +static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu) +{ + u8 version = vcpu_to_pmu(vcpu)->version; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return; + + if (version > 1 && version < 4) + intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu); +} + +static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set) +{ + struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); + int i; + + for (i = 0; i < lbr->nr; i++) { + vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set); + if (lbr->info) + vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set); + } + + vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set); +} + +static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->msr_passthrough) + return; + + vmx_update_intercept_for_lbr_msrs(vcpu, true); + lbr_desc->msr_passthrough = false; +} + +static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->msr_passthrough) + return; + + vmx_update_intercept_for_lbr_msrs(vcpu, false); + lbr_desc->msr_passthrough = true; +} + +/* + * Higher priority host perf events (e.g. cpu pinned) could reclaim the + * pmu resources (e.g. LBR) that were assigned to the guest. This is + * usually done via ipi calls (more details in perf_install_in_context). + * + * Before entering the non-root mode (with irq disabled here), double + * confirm that the pmu features enabled to the guest are not reclaimed + * by higher priority host events. Otherwise, disallow vcpu's access to + * the reclaimed features. + */ +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->event) { + vmx_disable_lbr_msrs_passthrough(vcpu); + if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) + goto warn; + if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) + goto warn; + return; + } + + if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) { + vmx_disable_lbr_msrs_passthrough(vcpu); + __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + goto warn; + } else + vmx_enable_lbr_msrs_passthrough(vcpu); + + return; + +warn: + pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", + vcpu->vcpu_id); +} + +static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) +{ + if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) + intel_pmu_release_guest_lbr_event(vcpu); } struct kvm_pmu_ops intel_pmu_ops = { @@ -441,4 +719,6 @@ struct kvm_pmu_ops intel_pmu_ops = { .refresh = intel_pmu_refresh, .init = intel_pmu_init, .reset = intel_pmu_reset, + .deliver_pmi = intel_pmu_deliver_pmi, + .cleanup = intel_pmu_cleanup, }; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index f02962dcc72c..4831bc44ce66 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -54,7 +54,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) dest = cpu_physical_id(cpu); - if (x2apic_enabled()) + if (x2apic_mode) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; @@ -104,7 +104,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) dest = cpu_physical_id(vcpu->cpu); - if (x2apic_enabled()) + if (x2apic_mode) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; @@ -174,7 +174,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) */ dest = cpu_physical_id(vcpu->pre_pcpu); - if (x2apic_enabled()) + if (x2apic_mode) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 90ad7a6246e3..3a6461694fc2 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -44,7 +44,7 @@ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump * to vmx_vmexit. */ -SYM_FUNC_START(vmx_vmenter) +SYM_FUNC_START_LOCAL(vmx_vmenter) /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ je 2f @@ -132,7 +132,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - cmpb $0, %bl + testb %bl, %bl /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 47b8357b9751..32cf8287d4a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -40,7 +40,6 @@ #include <asm/irq_remapping.h> #include <asm/kexec.h> #include <asm/perf_event.h> -#include <asm/mce.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> #include <asm/mwait.h> @@ -51,6 +50,7 @@ #include "capabilities.h" #include "cpuid.h" #include "evmcs.h" +#include "hyperv.h" #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" @@ -553,7 +553,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; struct hv_partition_assist_pg **p_hv_pa_pg = - &vcpu->kvm->arch.hyperv.hv_pa_pg; + &to_kvm_hv(vcpu->kvm)->hv_pa_pg; /* * Synthetic VM-Exit is not enabled in current code and so All * evmcs in singe VM shares same assist page. @@ -659,6 +659,14 @@ static bool is_valid_passthrough_msr(u32 msr) case MSR_IA32_RTIT_CR3_MATCH: case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ + case MSR_LBR_SELECT: + case MSR_LBR_TOS: + case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: + case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: + case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: + case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: + case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: + /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ return true; } @@ -807,7 +815,7 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) return *p; } -void update_exception_bitmap(struct kvm_vcpu *vcpu) +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; @@ -1103,7 +1111,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) { /* The base must be 128-byte aligned and a legal physical address. */ - return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); + return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); } static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) @@ -1578,7 +1586,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { orig_rip = kvm_rip_read(vcpu); rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); #ifdef CONFIG_X86_64 @@ -1826,7 +1834,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = to_vmx(vcpu)->spec_ctrl; @@ -1925,6 +1933,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; goto find_uret_msr; + case MSR_IA32_DEBUGCTLMSR: + msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); + break; default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_info->index); @@ -1948,6 +1959,16 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; } +static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) +{ + u64 debugctl = vmx_supported_debugctl(); + + if (!intel_pmu_lbr_is_enabled(vcpu)) + debugctl &= ~DEBUGCTLMSR_LBR_MASK; + + return debugctl; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -1998,14 +2019,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_DEBUGCTLMSR: { + u64 invalid = data & ~vcpu_supported_debugctl(vcpu); + if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + } + + if (invalid) + return 1; + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data; - ret = kvm_set_msr_common(vcpu, msr_info); - break; - + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); + return 0; + } case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -2028,7 +2064,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) @@ -2063,12 +2099,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) goto find_uret_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; @@ -2197,6 +2233,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data >> 32) != 0) return 1; goto find_uret_msr; + case MSR_IA32_PERF_CAPABILITIES: + if (data && !vcpu_to_pmu(vcpu)->version) + return 1; + if (data & PMU_CAP_LBR_FMT) { + if ((data & PMU_CAP_LBR_FMT) != + (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) + return 1; + if (!intel_pmu_lbr_is_compatible(vcpu)) + return 1; + } + ret = kvm_set_msr_common(vcpu, msr_info); + break; default: find_uret_msr: @@ -2266,7 +2314,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) u64 msr; cr4_set_bits(X86_CR4_VMXE); - intel_pt_handle_vmx(1); asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) @@ -2277,7 +2324,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); - intel_pt_handle_vmx(0); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; @@ -2300,9 +2346,13 @@ static int hardware_enable(void) !hv_get_vp_assist_page(cpu)) return -EFAULT; + intel_pt_handle_vmx(1); + r = kvm_cpu_vmxon(phys_addr); - if (r) + if (r) { + intel_pt_handle_vmx(0); return r; + } if (enable_ept) ept_sync_global(); @@ -2320,22 +2370,14 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } - -/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() - * tricks. - */ -static void kvm_cpu_vmxoff(void) -{ - asm volatile (__ex("vmxoff")); - - intel_pt_handle_vmx(0); - cr4_clear_bits(X86_CR4_VMXE); -} - static void hardware_disable(void) { vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + intel_pt_handle_vmx(0); } /* @@ -2429,7 +2471,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | - SECONDARY_EXEC_ENABLE_VMFUNC; + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_BUS_LOCK_DETECTION; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -2740,7 +2783,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); @@ -2820,7 +2863,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); @@ -3095,8 +3138,25 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, vmcs_writel(GUEST_CR3, guest_cr3); } -int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { + /* + * We operate under the default treatment of SMM, so VMX cannot be + * enabled under SMM. Note, whether or not VMXE is allowed at all is + * handled by kvm_is_valid_cr4(). + */ + if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) + return false; + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) + return false; + + return true; +} + +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = vcpu->arch.cr4; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Pass through host's Machine Check Enable value to hw_cr4, which @@ -3123,21 +3183,6 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } } - if (cr4 & X86_CR4_VMXE) { - /* - * To use VMXON (and later other VMX instructions), a guest - * must first be able to turn on cr4.VMXE (see handle_vmon()). - * So basically the check on whether to allow nested VMX - * is here. We operate under the default treatment of SMM, - * so VMX cannot be enabled under SMM. - */ - if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) - return 1; - } - - if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) - return 1; - vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); @@ -3168,7 +3213,9 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); - return 0; + + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + kvm_update_cpuid_runtime(vcpu); } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) @@ -3515,42 +3562,33 @@ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) return true; } -static int init_rmode_tss(struct kvm *kvm) +static int init_rmode_tss(struct kvm *kvm, void __user *ua) { - gfn_t fn; - u16 data = 0; - int idx, r; + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + u16 data; + int i; + + for (i = 0; i < 3; i++) { + if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) + return -EFAULT; + } - idx = srcu_read_lock(&kvm->srcu); - fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, - TSS_IOPB_BASE_OFFSET, sizeof(u16)); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; + if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) + return -EFAULT; + data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, - RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); -out: - srcu_read_unlock(&kvm->srcu, idx); - return r; + if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) + return -EFAULT; + + return 0; } static int init_rmode_identity_map(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i, r = 0; - kvm_pfn_t identity_map_pfn; + void __user *uaddr; u32 tmp; /* Protect kvm_vmx->ept_identity_pagetable_done. */ @@ -3561,24 +3599,24 @@ static int init_rmode_identity_map(struct kvm *kvm) if (!kvm_vmx->ept_identity_map_addr) kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm_vmx->ept_identity_map_addr, PAGE_SIZE); - if (r < 0) + uaddr = __x86_set_memory_region(kvm, + IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm_vmx->ept_identity_map_addr, + PAGE_SIZE); + if (IS_ERR(uaddr)) { + r = PTR_ERR(uaddr); goto out; + } - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); - if (r < 0) - goto out; /* Set up identity-mapping pagetable for EPT in real mode */ for (i = 0; i < PT32_ENT_PER_PAGE; i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - r = kvm_write_guest_page(kvm, identity_map_pfn, - &tmp, i * sizeof(tmp), sizeof(tmp)); - if (r < 0) + if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { + r = -EFAULT; goto out; + } } kvm_vmx->ept_identity_pagetable_done = true; @@ -3605,19 +3643,22 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { struct page *page; - int r = 0; + void __user *hva; + int ret = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) goto out; - r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (r) + hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); + if (IS_ERR(hva)) { + ret = PTR_ERR(hva); goto out; + } page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { - r = -EFAULT; + ret = -EFAULT; goto out; } @@ -3629,7 +3670,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); - return r; + return ret; } int allocate_vpid(void) @@ -3777,7 +3818,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value) { if (value) @@ -4236,7 +4277,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - if (!enable_pml) + /* + * PML is enabled/disabled when dirty logging of memsmlots changes, but + * it needs to be set here when dirty logging is already active, e.g. + * if this vCPU was created after dirty logging was enabled. + */ + if (!vcpu->kvm->arch.cpu_dirty_logging_count) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; if (cpu_has_vmx_xsaves()) { @@ -4254,24 +4300,17 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); - - /* - * Expose INVPCID if and only if PCID is also exposed to the guest. - * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF - * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect - * behavior from the guest perspective (it would expect #GP or #PF). - */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); - vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, ENABLE_USR_WAIT_PAUSE, false); + if (!vcpu->kvm->arch.bus_lock_detection_enabled) + exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + vmx->secondary_exec_control = exec_control; } @@ -4470,23 +4509,23 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); if (init_event) vmx_clear_hlt(vcpu); } -static void enable_irq_window(struct kvm_vcpu *vcpu) +static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } -static void enable_nmi_window(struct kvm_vcpu *vcpu) +static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); + vmx_enable_irq_window(vcpu); return; } @@ -4638,7 +4677,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { - int ret; + void __user *ret; if (enable_unrestricted_guest) return 0; @@ -4648,10 +4687,12 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) PAGE_SIZE * 3); mutex_unlock(&kvm->slots_lock); - if (ret) - return ret; + if (IS_ERR(ret)) + return PTR_ERR(ret); + to_kvm_vmx(kvm)->tss_addr = addr; - return init_rmode_tss(kvm); + + return init_rmode_tss(kvm, ret); } static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) @@ -4716,25 +4757,6 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, return 1; } -/* - * Trigger machine check on the host. We assume all the MSRs are already set up - * by the CPU and that we still run on the same CPU as the MCE occurred on. - * We pass a fake environment to the machine check handler because we want - * the guest to be always treated like user space, no matter what context - * it used internally. - */ -static void kvm_machine_check(void) -{ -#if defined(CONFIG_X86_MCE) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; - - do_machine_check(®s); -#endif -} - static int handle_machine_check(struct kvm_vcpu *vcpu) { /* handled by vmx_vcpu_run() */ @@ -4844,7 +4866,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; } - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); fallthrough; case BP_VECTOR: @@ -5069,6 +5091,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int dr, dr7, reg; + int err = 1; exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; @@ -5077,9 +5100,9 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!kvm_require_dr(vcpu, dr)) return 1; - /* Do not handle if the CPL > 0, will trigger GP on re-entry */ - if (!kvm_require_cpl(vcpu, 0)) - return 1; + if (kvm_x86_ops.get_cpl(vcpu) > 0) + goto out; + dr7 = vmcs_readl(GUEST_DR7); if (dr7 & DR7_GD) { /* @@ -5088,7 +5111,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) * guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1; + vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; @@ -5116,14 +5139,15 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; - if (kvm_get_dr(vcpu, dr, &val)) - return 1; + kvm_get_dr(vcpu, dr, &val); kvm_register_write(vcpu, reg, val); - } else - if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) - return 1; + err = 0; + } else { + err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)); + } - return kvm_skip_emulated_instruction(vcpu); +out: + return kvm_complete_insn_gp(vcpu, err); } static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) @@ -5197,9 +5221,8 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) u64 new_bv = kvm_read_edx_eax(vcpu); u32 index = kvm_rcx_read(vcpu); - if (kvm_set_xcr(vcpu, index, new_bv) == 0) - return kvm_skip_emulated_instruction(vcpu); - return 1; + int err = kvm_set_xcr(vcpu, index, new_bv); + return kvm_complete_insn_gp(vcpu, err); } static int handle_apic_access(struct kvm_vcpu *vcpu) @@ -5620,6 +5643,13 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } +static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + return 0; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5676,6 +5706,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, }; static const int kvm_vmx_max_exit_handlers = @@ -5687,7 +5718,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, struct vcpu_vmx *vmx = to_vmx(vcpu); *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + if (!(vmx->exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -5740,24 +5771,6 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } -/* - * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. - * Called before reporting dirty_bitmap to userspace. - */ -static void kvm_flush_pml_buffers(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - /* - * We only need to kick vcpu out of guest mode here, as PML buffer - * is flushed at beginning of all VMEXITs, and it's obvious that only - * vcpus running in guest are possible to have unflushed GPAs in PML - * buffer. - */ - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_kick(vcpu); -} - static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", @@ -5928,20 +5941,22 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + u16 exit_handler_index; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before * querying dirty_bitmap, we only need to kick all vcpus out of guest * mode as if vcpus is in root mode, the PML buffer must has been - * flushed already. + * flushed already. Note, PML is never enabled in hardware while + * running L2. */ - if (enable_pml) + if (enable_pml && !is_guest_mode(vcpu)) vmx_flush_pml_buffer(vcpu); /* @@ -5958,6 +5973,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { /* + * PML is never enabled when running L2, bail immediately if a + * PML full exit occurs as something is horribly wrong. + */ + if (exit_reason.basic == EXIT_REASON_PML_FULL) + goto unexpected_vmexit; + + /* * The host physical addresses of some pages of guest memory * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC * Page). The CPU may write to these pages via their host @@ -5974,11 +5996,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 1; } - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + if (exit_reason.failed_vmentry) { dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; + = exit_reason.full; vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6000,18 +6022,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * will cause infinite loop. */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_PML_FULL && - exit_reason != EXIT_REASON_APIC_ACCESS && - exit_reason != EXIT_REASON_TASK_SWITCH)) { + (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && + exit_reason.basic != EXIT_REASON_EPT_VIOLATION && + exit_reason.basic != EXIT_REASON_PML_FULL && + exit_reason.basic != EXIT_REASON_APIC_ACCESS && + exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; - if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { vcpu->run->internal.ndata++; vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -6043,42 +6065,62 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - if (exit_reason >= kvm_vmx_max_exit_handlers) + if (exit_reason.basic >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; #ifdef CONFIG_RETPOLINE - if (exit_reason == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); - else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); - else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) + else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) return handle_interrupt_window(vcpu); - else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) return handle_external_interrupt(vcpu); - else if (exit_reason == EXIT_REASON_HLT) + else if (exit_reason.basic == EXIT_REASON_HLT) return kvm_emulate_halt(vcpu); - else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) return handle_ept_misconfig(vcpu); #endif - exit_reason = array_index_nospec(exit_reason, - kvm_vmx_max_exit_handlers); - if (!kvm_vmx_exit_handlers[exit_reason]) + exit_handler_index = array_index_nospec((u16)exit_reason.basic, + kvm_vmx_max_exit_handlers); + if (!kvm_vmx_exit_handlers[exit_handler_index]) goto unexpected_vmexit; - return kvm_vmx_exit_handlers[exit_reason](vcpu); + return kvm_vmx_exit_handlers[exit_handler_index](vcpu); unexpected_vmexit: - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason.full); dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[0] = exit_reason.full; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } +static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +{ + int ret = __vmx_handle_exit(vcpu, exit_fastpath); + + /* + * Even when current exit reason is handled by KVM internally, we + * still need to exit to user space when bus lock detected to inform + * that there is a bus lock in guest. + */ + if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (ret > 0) + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + return 0; + } + return ret; +} + /* * Software based L1D cache flush which is used when microcode providing * the cache control MSR is not loaded. @@ -6149,7 +6191,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) : "eax", "ebx", "ecx", "edx"); } -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; @@ -6393,13 +6435,17 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); - else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); } -static bool vmx_has_emulated_msr(u32 index) +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: @@ -6534,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) int i, nr_msrs; struct perf_guest_switch_msr *msrs; + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs); - if (!msrs) return; @@ -6583,7 +6629,7 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - switch (to_vmx(vcpu)->exit_reason) { + switch (to_vmx(vcpu)->exit_reason.basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: @@ -6593,8 +6639,6 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } } -bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); - static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { @@ -6654,11 +6698,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { - fastpath_t exit_fastpath; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; -reenter_guest: /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) @@ -6669,6 +6711,8 @@ reenter_guest: if (vmx->emulation_required) return EXIT_FASTPATH_NONE; + trace_kvm_entry(vcpu); + if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; vmcs_write32(PLE_WINDOW, vmx->ple_window); @@ -6710,6 +6754,8 @@ reenter_guest: pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); + if (intel_pmu_lbr_is_enabled(vcpu)) + vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) vmx_update_hv_timer(vcpu); @@ -6748,12 +6794,12 @@ reenter_guest: x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); /* All fields are clean at this point */ - if (static_branch_unlikely(&enable_evmcs)) + if (static_branch_unlikely(&enable_evmcs)) { current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - if (static_branch_unlikely(&enable_evmcs)) - current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index; + current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); + } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) @@ -6782,21 +6828,23 @@ reenter_guest: vmx->idt_vectoring_info = 0; if (unlikely(vmx->fail)) { - vmx->exit_reason = 0xdead; + vmx->exit_reason.full = 0xdead; return EXIT_FASTPATH_NONE; } - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)) + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); - trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); @@ -6804,22 +6852,7 @@ reenter_guest: if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - exit_fastpath = vmx_exit_handlers_fastpath(vcpu); - if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) { - if (!kvm_vcpu_exit_request(vcpu)) { - /* - * FIXME: this goto should be a loop in vcpu_enter_guest, - * but it would incur the cost of a retpoline for now. - * Revisit once static calls are available. - */ - if (vcpu->arch.apicv_active) - vmx_sync_pir_to_irr(vcpu); - goto reenter_guest; - } - exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; - } - - return exit_fastpath; + return vmx_exit_handlers_fastpath(vcpu); } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -6874,11 +6907,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) switch (index) { case MSR_IA32_TSX_CTRL: /* - * No need to pass TSX_CTRL_CPUID_CLEAR through, so - * let's avoid changing CPUID bits under the host - * kernel's feet. + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID + * interception. Keep the host value unchanged to avoid + * changing CPUID bits under the host kernel's feet. + * + * hle=0, rtm=0, tsx_ctrl=1 can be found with some + * combinations of new kernel and old userspace. If + * those guests run on a tsx=off host, do allow guests + * to use TSX_CTRL, but do not change the value on the + * host so that TSX remains always disabled. */ - vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + if (boot_cpu_has(X86_FEATURE_RTM)) + vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + else + vmx->guest_uret_msrs[j].mask = 0; break; default: vmx->guest_uret_msrs[j].mask = -1ull; @@ -7261,7 +7303,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_cr4_guest_host_mask(vmx); /* Refresh #PF interception to account for MAXPHYADDR changes. */ - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); } static __init void vmx_set_cpu_caps(void) @@ -7275,8 +7317,8 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0x7 */ if (kvm_mpx_supported()) kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); - if (cpu_has_vmx_invpcid()) - kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); + if (!cpu_has_vmx_invpcid()) + kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); @@ -7454,30 +7496,24 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } -static void vmx_slot_enable_log_dirty(struct kvm *kvm, - struct kvm_memory_slot *slot) +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); - kvm_mmu_slot_largepage_remove_write_access(kvm, slot); -} - -static void vmx_slot_disable_log_dirty(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - kvm_mmu_slot_set_dirty(kvm, slot); -} + struct vcpu_vmx *vmx = to_vmx(vcpu); -static void vmx_flush_log_dirty(struct kvm *kvm) -{ - kvm_flush_pml_buffers(kvm); -} + if (is_guest_mode(vcpu)) { + vmx->nested.update_vmcs01_cpu_dirty_logging = true; + return; + } -static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *memslot, - gfn_t offset, unsigned long mask) -{ - kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); + /* + * Note, cpu_dirty_logging_count can be changed concurrent with this + * code, but in that case another update request will be made and so + * the guest will never run with a stale PML value. + */ + if (vcpu->kvm->arch.cpu_dirty_logging_count) + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); + else + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -7551,14 +7587,14 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 0; } -static void enable_smi_window(struct kvm_vcpu *vcpu) +static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { - return to_vmx(vcpu)->nested.vmxon; + return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); } static void vmx_migrate_timers(struct kvm_vcpu *vcpu) @@ -7606,7 +7642,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_exception_bitmap = update_exception_bitmap, + .update_exception_bitmap = vmx_update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, @@ -7616,6 +7652,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .set_cr0 = vmx_set_cr0, + .is_valid_cr4 = vmx_is_valid_cr4, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, .get_idt = vmx_get_idt, @@ -7648,9 +7685,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, + .enable_nmi_window = vmx_enable_nmi_window, + .enable_irq_window = vmx_enable_irq_window, + .update_cr8_intercept = vmx_update_cr8_intercept, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, @@ -7685,10 +7722,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .sched_in = vmx_sched_in, - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, - .slot_disable_log_dirty = vmx_slot_disable_log_dirty, - .flush_log_dirty = vmx_flush_log_dirty, - .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .cpu_dirty_log_size = PML_ENTITY_NUM, + .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, .pre_block = vmx_pre_block, .post_block = vmx_post_block, @@ -7708,13 +7743,16 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, - .enable_smi_window = enable_smi_window, + .enable_smi_window = vmx_enable_smi_window, .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, .msr_filter_changed = vmx_msr_filter_changed, + .complete_emulated_msr = kvm_complete_insn_gp, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; static __init int hardware_setup(void) @@ -7805,6 +7843,8 @@ static __init int hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 48; } + kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) @@ -7827,12 +7867,8 @@ static __init int hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) { - vmx_x86_ops.slot_enable_log_dirty = NULL; - vmx_x86_ops.slot_disable_log_dirty = NULL; - vmx_x86_ops.flush_log_dirty = NULL; - vmx_x86_ops.enable_log_dirty_pt_masked = NULL; - } + if (!enable_pml) + vmx_x86_ops.cpu_dirty_log_size = 0; if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f6f66e5c6510..89da5e1251f1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -70,6 +70,54 @@ struct pt_desc { struct pt_ctx guest; }; +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + +#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) +#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) + +bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); + +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. + */ + struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -117,6 +165,7 @@ struct nested_vmx { bool change_vmcs01_virtual_apic_mode; bool reload_vmcs01_apic_access_page; + bool update_vmcs01_cpu_dirty_logging; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to @@ -244,7 +293,7 @@ struct vcpu_vmx { int vpid; bool emulation_required; - u32 exit_reason; + union vmx_exit_reason exit_reason; /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -279,6 +328,7 @@ struct vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; + struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ #define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 @@ -321,7 +371,7 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -329,7 +379,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, int root_level); -void update_exception_bitmap(struct kvm_vcpu *vcpu); +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); @@ -339,8 +389,12 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value); +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); static inline u8 vmx_get_rvi(void) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e545a8a613b1..f7d12fca397b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -29,6 +29,7 @@ #include "pmu.h" #include "hyperv.h" #include "lapic.h" +#include "xen.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -105,6 +106,7 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void process_smi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); @@ -113,11 +115,21 @@ static int sync_regs(struct kvm_vcpu *vcpu); struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); +#define KVM_X86_OP(func) \ + DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ + *(((struct kvm_x86_ops *)0)->func)); +#define KVM_X86_OP_NULL KVM_X86_OP +#include <asm/kvm-x86-ops.h> +EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); +EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); +EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); + static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); -static bool __read_mostly report_ignored_msrs = true; +bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +EXPORT_SYMBOL_GPL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); @@ -135,6 +147,8 @@ u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); u64 __read_mostly kvm_default_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); +bool __read_mostly kvm_has_bus_lock_exit; +EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; @@ -197,7 +211,8 @@ EXPORT_SYMBOL_GPL(host_efer); bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); -static u64 __read_mostly host_xss; +u64 __read_mostly host_xss; +EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); @@ -232,7 +247,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), - VM_STAT("mmu_pte_updated", mmu_pte_updated), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), VM_STAT("mmu_flooded", mmu_flooded), VM_STAT("mmu_recycled", mmu_recycled), @@ -257,8 +271,7 @@ static struct kmem_cache *x86_emulator_cache; * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. */ -static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -393,7 +406,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | + u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) @@ -482,19 +495,24 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) */ vcpu->arch.dr6 &= ~DR_TRAP_BITS; /* - * DR6.RTM is set by all #DB exceptions that don't clear it. + * In order to reflect the #DB exception payload in guest + * dr6, three components need to be considered: active low + * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, + * DR6_BS and DR6_BT) + * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. + * In the target guest dr6: + * FIXED_1 bits should always be set. + * Active low bits should be cleared if 1-setting in payload. + * Active high bits should be set if 1-setting in payload. + * + * Note, the payload is compatible with the pending debug + * exceptions/exit qualification under VMX, that active_low bits + * are active high in payload. + * So they need to be flipped for DR6. */ - vcpu->arch.dr6 |= DR6_RTM; + vcpu->arch.dr6 |= DR6_ACTIVE_LOW; vcpu->arch.dr6 |= payload; - /* - * Bit 16 should be set in the payload whenever the #DB - * exception should clear DR6.RTM. This makes the payload - * compatible with the pending debug exceptions under VMX. - * Though not currently documented in the SDM, this also - * makes the payload compatible with the exit qualification - * for #DB exceptions under VMX. - */ - vcpu->arch.dr6 ^= payload & DR6_RTM; + vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; /* * The #DB payload is defined as compatible with the 'pending @@ -690,7 +708,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl) + if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -740,8 +758,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { - return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | - rsvd_bits(1, 2); + return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); } /* @@ -804,11 +821,29 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(pdptrs_changed); +void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) +{ + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; + + if ((cr0 ^ old_cr0) & X86_CR0_PG) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + } + + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); + + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); +} +EXPORT_SYMBOL_GPL(kvm_post_set_cr0); + int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -832,7 +867,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } @@ -845,20 +880,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; - kvm_x86_ops.set_cr0(vcpu, cr0); - - if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - } - - if ((cr0 ^ old_cr0) & update_bits) - kvm_mmu_reset_context(vcpu); + static_call(kvm_x86_set_cr0)(vcpu, cr0); - if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); + kvm_post_set_cr0(vcpu, old_cr0, cr0); return 0; } @@ -872,6 +896,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return; + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) @@ -892,6 +919,9 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return; + if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { @@ -955,35 +985,43 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (kvm_x86_ops.get_cpl(vcpu) != 0 || - __kvm_set_xcr(vcpu, index, xcr)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return 0; + if (static_call(kvm_x86_get_cpl)(vcpu) == 0) + return __kvm_set_xcr(vcpu, index, xcr); + + return 1; } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) - return -EINVAL; + return false; if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return -EINVAL; + return false; - return 0; + return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); +} +EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); + +void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) +{ + unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + + if (((cr4 ^ old_cr4) & mmu_role_bits) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + kvm_mmu_reset_context(vcpu); } -EXPORT_SYMBOL_GPL(kvm_valid_cr4); +EXPORT_SYMBOL_GPL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP; - unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE; - if (kvm_valid_cr4(vcpu, cr4)) + if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -1006,15 +1044,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (kvm_x86_ops.set_cr4(vcpu, cr4)) - return 1; - - if (((cr4 ^ old_cr4) & mmu_role_bits) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) - kvm_mmu_reset_context(vcpu); + static_call(kvm_x86_set_cr4)(vcpu, cr4); - if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + kvm_post_set_cr4(vcpu, old_cr4, cr4); return 0; } @@ -1040,8 +1072,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu) && - (cr3 & vcpu->arch.cr3_lm_rsvd_bits)) + if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; else if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) @@ -1095,7 +1126,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - kvm_x86_ops.set_dr7(vcpu, dr7); + static_call(kvm_x86_set_dr7)(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; @@ -1111,7 +1142,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) return fixed; } -static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { size_t size = ARRAY_SIZE(vcpu->arch.db); @@ -1124,13 +1155,13 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 4: case 6: if (!kvm_dr6_valid(val)) - return -1; /* #GP */ + return 1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); break; case 5: default: /* 7 */ if (!kvm_dr7_valid(val)) - return -1; /* #GP */ + return 1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); break; @@ -1138,18 +1169,9 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } - -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -{ - if (__kvm_set_dr(vcpu, dr, val)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return 0; -} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { size_t size = ARRAY_SIZE(vcpu->arch.db); @@ -1166,7 +1188,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) *val = vcpu->arch.dr7; break; } - return 0; } EXPORT_SYMBOL_GPL(kvm_get_dr); @@ -1375,16 +1396,24 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; - /* - * On TAA affected systems: - * - nothing to do if TSX is disabled on the host. - * - we emulate TSX_CTRL if present on the host. - * This lets the guest use VERW to clear CPU buffers. - */ - if (!boot_cpu_has(X86_FEATURE_RTM)) - data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); - else if (!boot_cpu_has_bug(X86_BUG_TAA)) + if (!boot_cpu_has(X86_FEATURE_RTM)) { + /* + * If RTM=0 because the kernel has disabled TSX, the host might + * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 + * and therefore knows that there cannot be TAA) but keep + * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, + * and we want to allow migrating those guests to tsx=off hosts. + */ + data &= ~ARCH_CAP_TAA_NO; + } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { data |= ARCH_CAP_TAA_NO; + } else { + /* + * Nothing to do here; we emulate TSX_CTRL if present on the + * host so the guest can choose between disabling TSX or + * using VERW to clear CPU buffers. + */ + } return data; } @@ -1399,7 +1428,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - return kvm_x86_ops.get_msr_feature(msr); + return static_call(kvm_x86_get_msr_feature)(msr); } return 0; } @@ -1415,7 +1444,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) r = 0; } @@ -1475,7 +1504,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - r = kvm_x86_ops.set_efer(vcpu, efer); + r = static_call(kvm_x86_set_efer)(vcpu, efer); if (r) { WARN_ON(r > 0); return r; @@ -1496,35 +1525,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; struct kvm *kvm = vcpu->kvm; - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; - u32 count = kvm->arch.msr_filter.count; - u32 i; - bool r = kvm->arch.msr_filter.default_allow; + bool allowed; int idx; + u32 i; - /* MSR filtering not set up or x2APIC enabled, allow everything */ - if (!count || (index >= 0x800 && index <= 0x8ff)) + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) return true; - /* Prevent collision with set_msr_filter */ idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < count; i++) { + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { u32 start = ranges[i].base; u32 end = start + ranges[i].nmsrs; u32 flags = ranges[i].flags; unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - r = !!test_bit(index - start, bitmap); + allowed = !!test_bit(index - start, bitmap); break; } } +out: srcu_read_unlock(&kvm->srcu, idx); - return r; + return allowed; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -1572,7 +1610,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, msr.index = index; msr.host_initiated = host_initiated; - return kvm_x86_ops.set_msr(vcpu, &msr); + return static_call(kvm_x86_set_msr)(vcpu, &msr); } static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, @@ -1581,7 +1619,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(vcpu, index, data, true)) + if (kvm_msr_ignored_check(index, data, true)) ret = 0; return ret; @@ -1605,7 +1643,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, msr.index = index; msr.host_initiated = host_initiated; - ret = kvm_x86_ops.get_msr(vcpu, &msr); + ret = static_call(kvm_x86_get_msr)(vcpu, &msr); if (!ret) *data = msr.data; return ret; @@ -1619,7 +1657,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) ret = 0; } @@ -1638,27 +1676,20 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); -static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) { - if (vcpu->run->msr.error) { - kvm_inject_gp(vcpu, 0); - return 1; - } else if (is_read) { + int err = vcpu->run->msr.error; + if (!err) { kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); } - return kvm_skip_emulated_instruction(vcpu); -} - -static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) -{ - return complete_emulated_msr(vcpu, true); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); } static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) { - return complete_emulated_msr(vcpu, false); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); } static u64 kvm_msr_reason(int r) @@ -1721,18 +1752,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) return 0; } - /* MSR read failed? Inject a #GP */ - if (r) { + if (!r) { + trace_kvm_msr_read(ecx, data); + + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; } - trace_kvm_msr_read(ecx, data); - - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); - return kvm_skip_emulated_instruction(vcpu); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); @@ -1753,24 +1782,21 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) if (r < 0) return r; - /* MSR write failed? Inject a #GP */ - if (r > 0) { + if (!r) + trace_kvm_msr_write(ecx, data); + else trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; - } - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(vcpu); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); -bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) +static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { + xfer_to_guest_mode_prepare(); return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); /* * The fast path for frequent and performance sensitive wrmsr emulation, @@ -1920,15 +1946,14 @@ static s64 get_kvmclock_base_ns(void) } #endif -static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) { int version; int r; struct pvclock_wall_clock wc; + u32 wc_sec_hi; u64 wall_nsec; - kvm->arch.wall_clock = wall_clock; - if (!wall_clock) return; @@ -1957,6 +1982,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + if (sec_hi_ofs) { + wc_sec_hi = wall_nsec >> 32; + kvm_write_guest(kvm, wall_clock + sec_hi_ofs, + &wc_sec_hi, sizeof(wc_sec_hi)); + } + version++; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } @@ -2193,7 +2224,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vcpu->arch.l1_tsc_offset = offset; - vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -2528,6 +2559,8 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + kvm_hv_invalidate_tsc_page(kvm); + spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ @@ -2576,13 +2609,15 @@ u64 get_kvmclock_ns(struct kvm *kvm) return ret; } -static void kvm_setup_pvclock_page(struct kvm_vcpu *v) +static void kvm_setup_pvclock_page(struct kvm_vcpu *v, + struct gfn_to_hva_cache *cache, + unsigned int offset) { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) + if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, + &guest_hv_clock, offset, sizeof(guest_hv_clock)))) return; /* This VCPU is paused, but it's legal for a guest to read another @@ -2605,9 +2640,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) ++guest_hv_clock.version; /* first time write, random junk */ vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_offset_cached(v->kvm, cache, + &vcpu->hv_clock, offset, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -2621,16 +2656,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_offset_cached(v->kvm, cache, + &vcpu->hv_clock, offset, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_offset_cached(v->kvm, cache, + &vcpu->hv_clock, offset, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2717,7 +2752,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v); + kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); + if (vcpu->xen.vcpu_info_set) + kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (vcpu->xen.vcpu_time_info_set) + kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); if (v == kvm_get_vcpu(v->kvm, 0)) kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; @@ -2842,32 +2882,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 - : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; - u32 page_num = data & ~PAGE_MASK; - u64 page_addr = data & PAGE_MASK; - u8 *page; - - if (page_num >= blob_size) - return 1; - - page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) - return PTR_ERR(page); - - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { - kfree(page); - return 1; - } - return 0; -} - static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; @@ -2939,13 +2953,13 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush_all(vcpu); + static_call(kvm_x86_tlb_flush_all)(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush_guest(vcpu); + static_call(kvm_x86_tlb_flush_guest)(vcpu); } static void record_steal_time(struct kvm_vcpu *vcpu) @@ -2953,6 +2967,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) struct kvm_host_map map; struct kvm_steal_time *st; + if (kvm_xen_msr_enabled(vcpu->kvm)) { + kvm_xen_runstate_set_running(vcpu); + return; + } + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -3001,6 +3020,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u32 msr = msr_info->index; u64 data = msr_info->data; + if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + return kvm_xen_write_hypercall_page(vcpu, data); + switch (msr) { case MSR_AMD64_NB_CFG: case MSR_IA32_UCODE_WRITE: @@ -3057,18 +3079,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case MSR_IA32_DEBUGCTLMSR: - if (!data) { - /* We support the non-activated case already */ - break; - } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { - /* Values other than LBR and BTF are vendor-specific, - thus reserved and should throw a #GP */ - return 1; - } else if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); - break; case 0x200 ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: @@ -3137,13 +3147,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; - kvm_write_wall_clock(vcpu->kvm, data); + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; - kvm_write_wall_clock(vcpu->kvm, data); + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) @@ -3288,8 +3300,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_misc_features_enables = data; break; default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return xen_hvm_config(vcpu, data); if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; @@ -3341,7 +3351,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: @@ -3678,6 +3687,27 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 __user *cpuid_arg) +{ + struct kvm_cpuid2 cpuid; + int r; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) + return r; + + r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + return r; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) + return r; + + return 0; +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -3702,7 +3732,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: - case KVM_CAP_XEN_HVM: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: @@ -3714,6 +3743,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: case KVM_CAP_HYPERV_CPUID: + case KVM_CAP_SYS_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -3741,6 +3771,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; +#ifdef CONFIG_KVM_XEN + case KVM_CAP_XEN_HVM: + r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | + KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | + KVM_XEN_HVM_CONFIG_SHARED_INFO; + if (sched_info_on()) + r |= KVM_XEN_HVM_CONFIG_RUNSTATE; + break; +#endif case KVM_CAP_SYNC_REGS: r = KVM_SYNC_X86_VALID_FIELDS; break; @@ -3762,10 +3801,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE); + r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: - r = !kvm_x86_ops.cpu_has_accelerated_tpr(); + r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; @@ -3807,6 +3846,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_STEAL_TIME: r = sched_info_on(); break; + case KVM_CAP_X86_BUS_LOCK_EXIT: + if (kvm_has_bus_lock_exit) + r = KVM_BUS_LOCK_DETECTION_OFF | + KVM_BUS_LOCK_DETECTION_EXIT; + else + r = 0; + break; default: break; } @@ -3899,6 +3945,9 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; + case KVM_GET_SUPPORTED_HV_CPUID: + r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); + break; default: r = -EINVAL; break; @@ -3921,14 +3970,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops.has_wbinvd_exit()) + if (static_call(kvm_x86_has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - kvm_x86_ops.vcpu_load(vcpu, cpu); + static_call(kvm_x86_vcpu_load)(vcpu, cpu); /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -3974,6 +4023,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct kvm_host_map map; struct kvm_steal_time *st; + int idx; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -3981,9 +4031,15 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, &vcpu->arch.st.cache, true)) - return; + goto out; st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); @@ -3991,33 +4047,22 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + +out: + srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - int idx; + if (vcpu->preempted && !vcpu->arch.guest_state_protected) + vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); - if (vcpu->preempted) - vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); + if (kvm_xen_msr_enabled(vcpu->kvm)) + kvm_xen_runstate_set_preempted(vcpu); + else + kvm_steal_time_set_preempted(vcpu); - /* - * Disable page faults because we're in atomic context here. - * kvm_write_guest_offset_cached() would call might_fault() - * that relies on pagefault_disable() to tell if there's a - * bug. NOTE: the write to guest memory may not go through if - * during postcopy live migration or if there's heavy guest - * paging. - */ - pagefault_disable(); - /* - * kvm_memslots() will be called by - * kvm_write_guest_offset_cached() so take the srcu lock. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_steal_time_set_preempted(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - pagefault_enable(); - kvm_x86_ops.vcpu_put(vcpu); + static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); /* * If userspace has set any breakpoints or watchpoints, dr6 is restored @@ -4031,7 +4076,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { if (vcpu->arch.apicv_active) - kvm_x86_ops.sync_pir_to_irr(vcpu); + static_call(kvm_x86_sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -4141,7 +4186,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - kvm_x86_ops.setup_mce(vcpu); + static_call(kvm_x86_setup_mce)(vcpu); out: return r; } @@ -4199,6 +4244,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, { process_nmi(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); + /* * In guest mode, payload delivery should be deferred, * so that the L1 hypervisor can intercept #PF before @@ -4245,11 +4293,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); + events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu); + events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ @@ -4316,13 +4364,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops.set_interrupt_shadow(vcpu, - events->interrupt.shadow); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked); + static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -4378,9 +4426,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - if (dbgregs->dr6 & ~0xffffffffull) + if (!kvm_dr6_valid(dbgregs->dr6)) return -EINVAL; - if (dbgregs->dr7 & ~0xffffffffull) + if (!kvm_dr7_valid(dbgregs->dr7)) return -EINVAL; memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); @@ -4481,6 +4529,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { + if (!vcpu->arch.guest_fpu) + return; + if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); @@ -4498,9 +4549,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv = - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; + u64 xstate_bv; + u32 mxcsr; + + if (!vcpu->arch.guest_fpu) + return 0; + + xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* @@ -4609,7 +4665,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, if (!kvm_x86_ops.enable_direct_tlbflush) return -ENOTTY; - return kvm_x86_ops.enable_direct_tlbflush(vcpu); + return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; @@ -4977,25 +5033,31 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } - case KVM_GET_SUPPORTED_HV_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + case KVM_GET_SUPPORTED_HV_CPUID: + r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); + break; +#ifdef CONFIG_KVM_XEN + case KVM_XEN_VCPU_GET_ATTR: { + struct kvm_xen_vcpu_attr xva; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) - goto out; - - r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid, - cpuid_arg->entries); - if (r) + if (copy_from_user(&xva, argp, sizeof(xva))) goto out; + r = kvm_xen_vcpu_get_attr(vcpu, &xva); + if (!r && copy_to_user(argp, &xva, sizeof(xva))) + r = -EFAULT; + break; + } + case KVM_XEN_VCPU_SET_ATTR: { + struct kvm_xen_vcpu_attr xva; r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) + if (copy_from_user(&xva, argp, sizeof(xva))) goto out; - r = 0; + r = kvm_xen_vcpu_set_attr(vcpu, &xva); break; } +#endif default: r = -EINVAL; } @@ -5017,14 +5079,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = kvm_x86_ops.set_tss_addr(kvm, addr); + ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr); + return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -5178,11 +5240,18 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { + /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. + * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called + * before reporting dirty_bitmap to userspace. KVM flushes the buffers + * on all VM-Exits, thus we only need to kick running vCPUs to force a + * VM-Exit. */ - if (kvm_x86_ops.flush_log_dirty) - kvm_x86_ops.flush_log_dirty(kvm); + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -5272,6 +5341,20 @@ split_irqchip_unlock: kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; + case KVM_CAP_X86_BUS_LOCK_EXIT: + r = -EINVAL; + if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE) + break; + + if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) && + (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) + break; + + if (kvm_has_bus_lock_exit && + cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) + kvm->arch.bus_lock_detection_enabled = true; + r = 0; + break; default: r = -EINVAL; break; @@ -5279,25 +5362,34 @@ split_irqchip_unlock: return r; } -static void kvm_clear_msr_filter(struct kvm *kvm) +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} + +static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) { u32 i; - u32 count = kvm->arch.msr_filter.count; - struct msr_bitmap_range ranges[16]; - mutex_lock(&kvm->lock); - kvm->arch.msr_filter.count = 0; - memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); - mutex_unlock(&kvm->lock); - synchronize_srcu(&kvm->srcu); + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); - for (i = 0; i < count; i++) - kfree(ranges[i].bitmap); + kfree(msr_filter); } -static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) { - struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; struct msr_bitmap_range range; unsigned long *bitmap = NULL; size_t bitmap_size; @@ -5331,11 +5423,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user goto err; } - /* Everything ok, add this range identifier to our global pool */ - ranges[kvm->arch.msr_filter.count] = range; - /* Make sure we filled the array before we tell anyone to walk it */ - smp_wmb(); - kvm->arch.msr_filter.count++; + /* Everything ok, add this range identifier. */ + msr_filter->ranges[msr_filter->count] = range; + msr_filter->count++; return 0; err: @@ -5346,10 +5436,11 @@ err: static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) { struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_x86_msr_filter *new_filter, *old_filter; struct kvm_msr_filter filter; bool default_allow; - int r = 0; bool empty = true; + int r = 0; u32 i; if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) @@ -5362,25 +5453,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (empty && !default_allow) return -EINVAL; - kvm_clear_msr_filter(kvm); - - kvm->arch.msr_filter.default_allow = default_allow; + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; - /* - * Protect from concurrent calls to this function that could trigger - * a TOCTOU violation on kvm->arch.msr_filter.count. - */ - mutex_lock(&kvm->lock); for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(kvm, &filter.ranges[i]); - if (r) - break; + r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } } + mutex_lock(&kvm->lock); + + /* The per-VM filter is protected by kvm->lock... */ + old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); + + rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); mutex_unlock(&kvm->lock); - return r; + return 0; } long kvm_arch_vm_ioctl(struct file *filp, @@ -5596,18 +5694,36 @@ set_pit2_out: kvm->arch.bsp_vcpu_id = arg; mutex_unlock(&kvm->lock); break; +#ifdef CONFIG_KVM_XEN case KVM_XEN_HVM_CONFIG: { struct kvm_xen_hvm_config xhc; r = -EFAULT; if (copy_from_user(&xhc, argp, sizeof(xhc))) goto out; - r = -EINVAL; - if (xhc.flags) + r = kvm_xen_hvm_config(kvm, &xhc); + break; + } + case KVM_XEN_HVM_GET_ATTR: { + struct kvm_xen_hvm_attr xha; + + r = -EFAULT; + if (copy_from_user(&xha, argp, sizeof(xha))) goto out; - memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc)); - r = 0; + r = kvm_xen_hvm_get_attr(kvm, &xha); + if (!r && copy_to_user(argp, &xha, sizeof(xha))) + r = -EFAULT; + break; + } + case KVM_XEN_HVM_SET_ATTR: { + struct kvm_xen_hvm_attr xha; + + r = -EFAULT; + if (copy_from_user(&xha, argp, sizeof(xha))) + goto out; + r = kvm_xen_hvm_set_attr(kvm, &xha); break; } +#endif case KVM_SET_CLOCK: { struct kvm_clock_data user_ns; u64 now_ns; @@ -5650,7 +5766,7 @@ set_pit2_out: case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops.mem_enc_op) - r = kvm_x86_ops.mem_enc_op(kvm, argp); + r = static_call(kvm_x86_mem_enc_op)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -5662,7 +5778,7 @@ set_pit2_out: r = -ENOTTY; if (kvm_x86_ops.mem_enc_reg_region) - r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion); + r = static_call(kvm_x86_mem_enc_reg_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -5674,7 +5790,7 @@ set_pit2_out: r = -ENOTTY; if (kvm_x86_ops.mem_enc_unreg_region) - r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion); + r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -5776,7 +5892,7 @@ static void kvm_init_msr_list(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i])) + if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -5839,13 +5955,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops.set_segment(vcpu, var, seg); + static_call(kvm_x86_set_segment)(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops.get_segment(vcpu, var, seg); + static_call(kvm_x86_get_segment)(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, @@ -5865,14 +5981,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5880,7 +5996,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5929,7 +6045,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -5954,7 +6070,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -5975,7 +6091,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = 0; - if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) + if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -6028,7 +6144,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = PFERR_WRITE_MASK; - if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) + if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -6053,7 +6169,7 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; - if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0))) + if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0))) return 1; if (force_emulation_prefix && @@ -6087,7 +6203,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -6495,7 +6611,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return kvm_x86_ops.get_segment_base(vcpu, seg); + return static_call(kvm_x86_get_segment_base)(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -6508,11 +6624,11 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (kvm_x86_ops.has_wbinvd_exit()) { + if (static_call(kvm_x86_has_wbinvd_exit)()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); @@ -6535,17 +6651,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) +static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long *dest) { - return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); + return kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -6613,27 +6729,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt)); + return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -6775,7 +6891,7 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage, + return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, &ctxt->exception); } @@ -6813,7 +6929,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked); + static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); } static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) @@ -6829,7 +6945,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate); + return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate); } static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) @@ -6891,7 +7007,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); + u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -6902,7 +7018,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - kvm_x86_ops.set_interrupt_shadow(vcpu, mask); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -6944,7 +7060,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); @@ -7005,7 +7121,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) { + if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -7065,9 +7181,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (vcpu->arch.mmu->direct_map) { unsigned int indirect_shadow_pages; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); if (indirect_shadow_pages) kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -7174,7 +7290,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW; kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -7186,10 +7302,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); int r; - r = kvm_x86_ops.skip_emulated_instruction(vcpu); + r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); if (unlikely(!r)) return 0; @@ -7218,7 +7334,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.eff_db); if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -7275,6 +7391,42 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } +/* + * Decode to be emulated instruction. Return EMULATION_OK if success. + */ +int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, + void *insn, int insn_len) +{ + int r = EMULATION_OK; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + init_emulate_ctxt(vcpu); + + /* + * We will reenter on the same instruction since we do not set + * complete_userspace_io. This does not handle watchpoints yet, + * those would be handled in the emulate_ops. + */ + if (!(emulation_type & EMULTYPE_SKIP) && + kvm_vcpu_check_breakpoint(vcpu, &r)) + return r; + + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + + ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; + + r = x86_decode_insn(ctxt, insn, insn_len); + + trace_kvm_emulate_insn_start(vcpu); + ++vcpu->stat.insn_emulation; + + return r; +} +EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); + int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) { @@ -7283,7 +7435,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool writeback = true; bool write_fault_to_spt; - if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len))) + if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len))) return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -7294,32 +7446,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; - kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - init_emulate_ctxt(vcpu); - - /* - * We will reenter on the same instruction since - * we do not set complete_userspace_io. This does not - * handle watchpoints yet, those would be handled in - * the emulate_ops. - */ - if (!(emulation_type & EMULTYPE_SKIP) && - kvm_vcpu_check_breakpoint(vcpu, &r)) - return r; - - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; + kvm_clear_exception_queue(vcpu); - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); - - trace_kvm_emulate_insn_start(vcpu); - ++vcpu->stat.insn_emulation; + r = x86_decode_emulated_instruction(vcpu, emulation_type, + insn, insn_len); if (r != EMULATION_OK) { if ((emulation_type & EMULTYPE_TRAP_UD) || (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { @@ -7426,7 +7558,7 @@ restart: r = 1; if (writeback) { - unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (!ctxt->have_exception || @@ -7435,7 +7567,7 @@ restart: if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); if (kvm_x86_ops.update_emulated_instruction) - kvm_x86_ops.update_emulated_instruction(vcpu); + static_call(kvm_x86_update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -7764,7 +7896,7 @@ static int kvm_is_user_mode(void) int user_mode = 3; if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu)); + user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu)); return user_mode != 0; } @@ -7909,7 +8041,6 @@ int kvm_arch_init(void *opaque) supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } - kvm_lapic_init(); if (pi_inject_timer == -1) pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); #ifdef CONFIG_X86_64 @@ -7951,19 +8082,28 @@ void kvm_arch_exit(void) kvm_mmu_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_fpu_cache); +#ifdef CONFIG_KVM_XEN + static_key_deferred_flush(&kvm_xen_enabled); + WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); +#endif } -int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) { ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + vcpu->arch.mp_state = state; return 1; } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; + vcpu->run->exit_reason = reason; return 0; } } + +int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +{ + return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) @@ -7977,6 +8117,14 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); + #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) @@ -7989,7 +8137,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) return -KVM_EOPNOTSUPP; - if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + if (!kvm_get_walltime_and_clockread(&ts, &cycle)) return -KVM_EOPNOTSUPP; clock_pairing.sec = ts.tv_sec; @@ -8065,7 +8213,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int op_64_bit; - if (kvm_hv_hypercall_enabled(vcpu->kvm)) + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); nr = kvm_rax_read(vcpu); @@ -8085,7 +8236,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (kvm_x86_ops.get_cpl(vcpu) != 0) { + if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { ret = -KVM_EPERM; goto out; } @@ -8142,7 +8293,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - kvm_x86_ops.patch_hypercall(vcpu, instruction); + static_call(kvm_x86_patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -8158,13 +8309,22 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; + /* + * if_flag is obsolete and useless, so do not bother + * setting it for SEV-ES guests. Userspace can just + * use kvm_run->ready_for_interrupt_injection. + */ + kvm_run->if_flag = !vcpu->arch.guest_state_protected + && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); + + if (is_smm(vcpu)) + kvm_run->flags |= KVM_RUN_X86_SMM; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -8190,7 +8350,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); + static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); } static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) @@ -8201,7 +8361,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit /* try to reinject previous events if any */ if (vcpu->arch.exception.injected) { - kvm_x86_ops.queue_exception(vcpu); + static_call(kvm_x86_queue_exception)(vcpu); can_inject = false; } /* @@ -8220,10 +8380,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) { - kvm_x86_ops.set_nmi(vcpu); + static_call(kvm_x86_set_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - kvm_x86_ops.set_irq(vcpu); + static_call(kvm_x86_set_irq)(vcpu); can_inject = false; } } @@ -8264,7 +8424,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit } } - kvm_x86_ops.queue_exception(vcpu); + static_call(kvm_x86_queue_exception)(vcpu); can_inject = false; } @@ -8280,7 +8440,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit * The kvm_x86_ops hooks communicate this by returning -EBUSY. */ if (vcpu->arch.smi_pending) { - r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY; + r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; if (r < 0) goto busy; if (r) { @@ -8289,35 +8449,35 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit enter_smm(vcpu); can_inject = false; } else - kvm_x86_ops.enable_smi_window(vcpu); + static_call(kvm_x86_enable_smi_window)(vcpu); } if (vcpu->arch.nmi_pending) { - r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY; + r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; if (r < 0) goto busy; if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - kvm_x86_ops.set_nmi(vcpu); + static_call(kvm_x86_set_nmi)(vcpu); can_inject = false; - WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0); + WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); } if (vcpu->arch.nmi_pending) - kvm_x86_ops.enable_nmi_window(vcpu); + static_call(kvm_x86_enable_nmi_window)(vcpu); } if (kvm_cpu_has_injectable_intr(vcpu)) { - r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY; + r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; if (r < 0) goto busy; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - kvm_x86_ops.set_irq(vcpu); - WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0); + static_call(kvm_x86_set_irq)(vcpu); + WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) - kvm_x86_ops.enable_irq_window(vcpu); + static_call(kvm_x86_enable_irq_window)(vcpu); } if (is_guest_mode(vcpu) && @@ -8342,7 +8502,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) * If an NMI is already in progress, limit further NMIs to just one. * Otherwise, allow two (and we'll inject the first one immediately). */ - if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); @@ -8432,11 +8592,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f7c, seg.limit); put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - kvm_x86_ops.get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); put_smstate(u32, buf, 0x7f70, dt.size); - kvm_x86_ops.get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); put_smstate(u32, buf, 0x7f58, dt.address); put_smstate(u32, buf, 0x7f54, dt.size); @@ -8486,7 +8646,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); - kvm_x86_ops.get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); put_smstate(u32, buf, 0x7e84, dt.size); put_smstate(u64, buf, 0x7e88, dt.address); @@ -8496,7 +8656,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); - kvm_x86_ops.get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); put_smstate(u32, buf, 0x7e64, dt.size); put_smstate(u64, buf, 0x7e68, dt.address); @@ -8526,30 +8686,30 @@ static void enter_smm(struct kvm_vcpu *vcpu) * vCPU state (e.g. leave guest mode) after we've saved the state into * the SMM state-save area. */ - kvm_x86_ops.pre_enter_smm(vcpu, buf); + static_call(kvm_x86_pre_enter_smm)(vcpu, buf); vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - if (kvm_x86_ops.get_nmi_mask(vcpu)) + if (static_call(kvm_x86_get_nmi_mask)(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - kvm_x86_ops.set_nmi_mask(vcpu, true); + static_call(kvm_x86_set_nmi_mask)(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - kvm_x86_ops.set_cr0(vcpu, cr0); + static_call(kvm_x86_set_cr0)(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_x86_ops.set_cr4(vcpu, 0); + static_call(kvm_x86_set_cr4)(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. */ dt.address = dt.size = 0; - kvm_x86_ops.set_idt(vcpu, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); - __kvm_set_dr(vcpu, 7, DR7_FIXED_1); + kvm_set_dr(vcpu, 7, DR7_FIXED_1); cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; cs.base = vcpu->arch.smbase; @@ -8578,7 +8738,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - kvm_x86_ops.set_efer(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); #endif kvm_update_cpuid_runtime(vcpu); @@ -8616,7 +8776,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); kvm_apic_update_apicv(vcpu); - kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu); + static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); @@ -8633,7 +8793,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) unsigned long old, new, expected; if (!kvm_x86_ops.check_apicv_inhibit_reasons || - !kvm_x86_ops.check_apicv_inhibit_reasons(bit)) + !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); @@ -8653,7 +8813,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) trace_kvm_apicv_update_request(activate, bit); if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate); + static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); /* * Sending request to update APICV for all other vcpus, @@ -8679,7 +8839,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { if (vcpu->arch.apicv_active) - kvm_x86_ops.sync_pir_to_irr(vcpu); + static_call(kvm_x86_sync_pir_to_irr)(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -8697,9 +8857,12 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, - vcpu_to_synic(vcpu)->vec_bitmap, 256); - kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); + if (to_hv_vcpu(vcpu)) + bitmap_or((ulong *)eoi_exit_bitmap, + vcpu->arch.ioapic_handled_vectors, + to_hv_synic(vcpu)->vec_bitmap, 256); + + static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -8724,7 +8887,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!kvm_x86_ops.set_apic_access_page_addr) return; - kvm_x86_ops.set_apic_access_page_addr(vcpu); + static_call(kvm_x86_set_apic_access_page_addr)(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -8748,6 +8911,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; + /* Forbid vmenter if vcpu dirty ring is soft-full */ + if (unlikely(vcpu->kvm->dirty_ring_size && + kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; + trace_kvm_dirty_ring_exit(vcpu); + r = 0; + goto out; + } + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { @@ -8840,8 +9012,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv = vcpu->arch.hyperv.exit; + vcpu->run->hyperv = hv_vcpu->exit; r = 0; goto out; } @@ -8858,10 +9032,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - kvm_x86_ops.msr_filter_changed(vcpu); + static_call(kvm_x86_msr_filter_changed)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) + static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || + kvm_xen_has_interrupt(vcpu)) { ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -8871,7 +9049,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) inject_pending_event(vcpu, &req_immediate_exit); if (req_int_win) - kvm_x86_ops.enable_irq_window(vcpu); + static_call(kvm_x86_enable_irq_window)(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -8886,7 +9064,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - kvm_x86_ops.prepare_guest_switch(vcpu); + static_call(kvm_x86_prepare_guest_switch)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -8917,7 +9095,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * notified with kvm_vcpu_kick. */ if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - kvm_x86_ops.sync_pir_to_irr(vcpu); + static_call(kvm_x86_sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -8931,11 +9109,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_x86_ops.request_immediate_exit(vcpu); + static_call(kvm_x86_request_immediate_exit)(vcpu); } - trace_kvm_entry(vcpu); - fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); @@ -8950,7 +9126,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - exit_fastpath = kvm_x86_ops.run(vcpu); + for (;;) { + exit_fastpath = static_call(kvm_x86_run)(vcpu); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { + exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; + break; + } + + if (vcpu->arch.apicv_active) + static_call(kvm_x86_sync_pir_to_irr)(vcpu); + } /* * Do this here before restoring debug registers on the host. And @@ -8960,7 +9148,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - kvm_x86_ops.sync_dirty_debug_regs(vcpu); + static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; @@ -8982,7 +9170,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops.handle_exit_irqoff(vcpu); + static_call(kvm_x86_handle_exit_irqoff)(vcpu); /* * Consume any pending interrupts, including the possible source of @@ -9024,13 +9212,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath); + r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); return r; cancel_injection: if (req_immediate_exit) kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_x86_ops.cancel_injection(vcpu); + static_call(kvm_x86_cancel_injection)(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: @@ -9040,13 +9228,13 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) { + (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (kvm_x86_ops.post_block) - kvm_x86_ops.post_block(vcpu); + static_call(kvm_x86_post_block)(vcpu); if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; @@ -9055,6 +9243,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -9223,9 +9412,14 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops.run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); + /* + * Guests with protected state can't have it set by the hypervisor, + * so skip trying to set it. + */ + if (vcpu->arch.guest_fpu) + /* PKRU is separately restored in kvm_x86_ops.run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, + ~XFEATURE_MASK_PKRU); fpregs_mark_activate(); fpregs_unlock(); @@ -9238,7 +9432,12 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - kvm_save_current_fpu(vcpu->arch.guest_fpu); + /* + * Guests with protected state can't have it read by the hypervisor, + * so skip trying to save it. + */ + if (vcpu->arch.guest_fpu) + kvm_save_current_fpu(vcpu->arch.guest_fpu); copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); @@ -9256,6 +9455,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu_load(vcpu); kvm_sigset_activate(vcpu); + kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { @@ -9417,6 +9617,9 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; + if (vcpu->arch.guest_state_protected) + goto skip_protected_regs; + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -9427,16 +9630,18 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - kvm_x86_ops.get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - kvm_x86_ops.get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; - sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = kvm_read_cr3(vcpu); + +skip_protected_regs: + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; @@ -9466,8 +9671,9 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, kvm_load_guest_fpu(vcpu); kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && - vcpu->arch.pv.pv_unhalted) + if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || + vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && + vcpu->arch.pv.pv_unhalted) mp_state->mp_state = KVM_MP_STATE_RUNNABLE; else mp_state->mp_state = vcpu->arch.mp_state; @@ -9535,7 +9741,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); -static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* @@ -9543,31 +9749,31 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) * 64-bit mode (though maybe in a 32-bit code segment). * CR4.PAE and EFER.LMA must be set. */ - if (!(sregs->cr4 & X86_CR4_PAE) - || !(sregs->efer & EFER_LMA)) - return -EINVAL; + if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) + return false; + if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) + return false; } else { /* * Not in 64-bit mode: EFER.LMA is clear and the code * segment cannot be 64-bit. */ if (sregs->efer & EFER_LMA || sregs->cs.l) - return -EINVAL; + return false; } - return kvm_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct msr_data apic_base_msr; int mmu_reset_needed = 0; - int cpuid_update_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; int ret = -EINVAL; - if (kvm_valid_sregs(vcpu, sregs)) + if (!kvm_is_valid_sregs(vcpu, sregs)) goto out; apic_base_msr.data = sregs->apic_base; @@ -9575,12 +9781,15 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (kvm_set_apic_base(vcpu, &apic_base_msr)) goto out; + if (vcpu->arch.guest_state_protected) + goto skip_protected_regs; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - kvm_x86_ops.set_idt(vcpu, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - kvm_x86_ops.set_gdt(vcpu, &dt); + static_call(kvm_x86_set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; @@ -9590,18 +9799,14 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops.set_efer(vcpu, sregs->efer); + static_call(kvm_x86_set_efer)(vcpu, sregs->efer); mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops.set_cr0(vcpu, sregs->cr0); + static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & - (X86_CR4_OSXSAVE | X86_CR4_PKE)); - kvm_x86_ops.set_cr4(vcpu, sregs->cr4); - if (cpuid_update_needed) - kvm_update_cpuid_runtime(vcpu); + static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { @@ -9613,14 +9818,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - max_bits = KVM_NR_INTERRUPTS; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, max_bits); - if (pending_vec < max_bits) { - kvm_queue_interrupt(vcpu, pending_vec, false); - pr_debug("Set back pending irq %d\n", pending_vec); - } - kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -9639,6 +9836,15 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +skip_protected_regs: + max_bits = KVM_NR_INTERRUPTS; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -9663,6 +9869,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; + if (vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { @@ -9705,7 +9914,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops.update_exception_bitmap(vcpu); + static_call(kvm_x86_update_exception_bitmap)(vcpu); r = 0; @@ -9742,6 +9951,9 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (!vcpu->arch.guest_fpu) + return 0; + vcpu_load(vcpu); fxsave = &vcpu->arch.guest_fpu->state.fxsave; @@ -9762,6 +9974,9 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (!vcpu->arch.guest_fpu) + return 0; + vcpu_load(vcpu); fxsave = &vcpu->arch.guest_fpu->state.fxsave; @@ -9820,6 +10035,9 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { + if (!vcpu->arch.guest_fpu) + return; + fpstate_init(&vcpu->arch.guest_fpu->state); if (boot_cpu_has(X86_FEATURE_XSAVES)) vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = @@ -9833,6 +10051,15 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.guest_fpu) { + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + vcpu->arch.guest_fpu = NULL; + } +} +EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) @@ -9865,11 +10092,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) vcpu->arch.apicv_active = true; } else - static_key_slow_inc(&kvm_no_apic_vcpu); + static_branch_inc(&kvm_has_noapic_vcpu); r = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); @@ -9903,6 +10130,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -9912,9 +10140,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pending_external_vector = -1; vcpu->arch.preempted_in_kernel = false; - kvm_hv_vcpu_init(vcpu); - - r = kvm_x86_ops.vcpu_create(vcpu); + r = static_call(kvm_x86_vcpu_create)(vcpu); if (r) goto free_guest_fpu; @@ -9928,7 +10154,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; free_guest_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + kvm_free_guest_fpu(vcpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); free_emulate_ctxt: @@ -9950,8 +10176,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - kvm_hv_vcpu_postcreate(vcpu); - if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); @@ -9977,12 +10201,12 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); - kvm_x86_ops.vcpu_free(vcpu); + static_call(kvm_x86_vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + kvm_free_guest_fpu(vcpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); @@ -9994,7 +10218,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); kvfree(vcpu->arch.cpuid_entries); if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); + static_branch_dec(&kvm_has_noapic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -10013,7 +10237,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); - vcpu->arch.dr6 = DR6_INIT; + vcpu->arch.dr6 = DR6_ACTIVE_LOW; vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -10030,7 +10254,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (kvm_mpx_supported()) { + if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { void *mpx_state_buffer; /* @@ -10066,7 +10290,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.ia32_xss = 0; - kvm_x86_ops.vcpu_reset(vcpu, init_event); + static_call(kvm_x86_vcpu_reset)(vcpu, init_event); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -10079,6 +10303,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); kvm_rip_write(vcpu, 0); } +EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); int kvm_arch_hardware_enable(void) { @@ -10091,7 +10316,7 @@ int kvm_arch_hardware_enable(void) bool stable, backwards_tsc = false; kvm_user_return_msr_cpu_online(); - ret = kvm_x86_ops.hardware_enable(); + ret = static_call(kvm_x86_hardware_enable)(); if (ret != 0) return ret; @@ -10173,7 +10398,7 @@ int kvm_arch_hardware_enable(void) void kvm_arch_hardware_disable(void) { - kvm_x86_ops.hardware_disable(); + static_call(kvm_x86_hardware_disable)(); drop_user_return_notifiers(); } @@ -10192,6 +10417,7 @@ int kvm_arch_hardware_setup(void *opaque) return r; memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + kvm_ops_static_call_update(); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; @@ -10220,7 +10446,7 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { - kvm_x86_ops.hardware_unsetup(); + static_call(kvm_x86_hardware_unsetup)(); } int kvm_arch_check_processor_compat(void *opaque) @@ -10248,8 +10474,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -struct static_key kvm_no_apic_vcpu __read_mostly; -EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); +__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); +EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { @@ -10260,12 +10486,12 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) pmu->need_cleanup = true; kvm_make_request(KVM_REQ_PMU, vcpu); } - kvm_x86_ops.sched_in(vcpu, cpu); + static_call(kvm_x86_sched_in)(vcpu, cpu); } void kvm_arch_free_vm(struct kvm *kvm) { - kfree(kvm->arch.hyperv.hv_pa_pg); + kfree(to_kvm_hv(kvm)->hv_pa_pg); vfree(kvm); } @@ -10304,7 +10530,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - return kvm_x86_ops.vm_init(kvm); + return static_call(kvm_x86_vm_init)(kvm); } int kvm_arch_post_init_vm(struct kvm *kvm) @@ -10349,7 +10575,32 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) + +/** + * __x86_set_memory_region: Setup KVM internal memory slot + * + * @kvm: the kvm pointer to the VM. + * @id: the slot ID to setup. + * @gpa: the GPA to install the slot (unused when @size == 0). + * @size: the size of the slot. Set to zero to uninstall a slot. + * + * This function helps to setup a KVM internal memory slot. Specify + * @size > 0 to install a new slot, while @size == 0 to uninstall a + * slot. The return code can be one of the following: + * + * HVA: on success (uninstall will return a bogus HVA) + * -errno: on error + * + * The caller should always use IS_ERR() to check the return value + * before use. Note, the KVM internal memory slots are guaranteed to + * remain valid and unchanged until the VM is destroyed, i.e., the + * GPA->HVA translation will not change. However, the HVA is a user + * address, i.e. its accessibility is not guaranteed, and must be + * accessed via __copy_{to,from}_user(). + */ +void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size) { int i, r; unsigned long hva, old_npages; @@ -10358,12 +10609,12 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) /* Called with kvm->slots_lock held. */ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) - return -EINVAL; + return ERR_PTR_USR(-EINVAL); slot = id_to_memslot(slots, id); if (size) { if (slot && slot->npages) - return -EEXIST; + return ERR_PTR_USR(-EEXIST); /* * MAP_SHARED to prevent internal slot pages from being moved @@ -10372,13 +10623,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); if (IS_ERR((void *)hva)) - return PTR_ERR((void *)hva); + return (void __user *)hva; } else { if (!slot || !slot->npages) - return 0; + return NULL; old_npages = slot->npages; - hva = 0; + hva = slot->userspace_addr; } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -10391,13 +10642,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) m.memory_size = size; r = __kvm_set_memory_region(kvm, &m); if (r < 0) - return r; + return ERR_PTR_USR(r); } if (!size) vm_munmap(hva, old_npages * PAGE_SIZE); - return 0; + return (void __user *)hva; } EXPORT_SYMBOL_GPL(__x86_set_memory_region); @@ -10408,8 +10659,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - u32 i; - if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10424,10 +10673,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - if (kvm_x86_ops.vm_destroy) - kvm_x86_ops.vm_destroy(kvm); - for (i = 0; i < kvm->arch.msr_filter.count; i++) - kfree(kvm->arch.msr_filter.ranges[i].bitmap); + static_call_cond(kvm_x86_vm_destroy)(kvm); + kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); @@ -10435,6 +10682,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); + kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); } @@ -10553,76 +10801,97 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return 0; } + +static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) +{ + struct kvm_arch *ka = &kvm->arch; + + if (!kvm_x86_ops.cpu_dirty_log_size) + return; + + if ((enable && ++ka->cpu_dirty_logging_count == 1) || + (!enable && --ka->cpu_dirty_logging_count == 0)) + kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); + + WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); +} + static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { + bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; + /* - * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot. - * See comments below. + * Update CPU dirty logging if dirty logging is being toggled. This + * applies to all operations. */ - if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) - return; + if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); /* - * Dirty logging tracks sptes in 4k granularity, meaning that large - * sptes have to be split. If live migration is successful, the guest - * in the source machine will be destroyed and large sptes will be - * created in the destination. However, if the guest continues to run - * in the source machine (for example if live migration fails), small - * sptes will remain around and cause bad performance. - * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Nothing more to do for RO slots (which can't be dirtied and can't be + * made writable) or CREATE/MOVE/DELETE of a slot. * - * There is no need to do this in any of the following cases: + * For a memslot with dirty logging disabled: * CREATE: No dirty mappings will already exist. * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot() + * + * For a memslot with dirty logging enabled: + * CREATE: No shadow pages exist, thus nothing to write-protect + * and no dirty bits to clear. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot(). */ - if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_zap_collapsible_sptes(kvm, new); + if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) + return; /* - * Enable or disable dirty logging for the slot. - * - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old - * slot have been zapped so no dirty logging updates are needed for - * the old slot. - * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible - * any mappings that might be created in it will consume the - * properties of the new slot and do not need to be updated here. - * - * When PML is enabled, the kvm_x86_ops dirty logging hooks are - * called to enable/disable dirty logging. - * - * When disabling dirty logging with PML enabled, the D-bit is set - * for sptes in the slot in order to prevent unnecessary GPA - * logging in the PML buffer (and potential PML buffer full VMEXIT). - * This guarantees leaving PML enabled for the guest's lifetime - * won't have any additional overhead from PML when the guest is - * running with dirty logging disabled. - * - * When enabling dirty logging, large sptes are write-protected - * so they can be split on first write. New large sptes cannot - * be created for this slot until the end of the logging. - * See the comments in fast_page_fault(). - * For small sptes, nothing is done if the dirty log is in the - * initial-all-set state. Otherwise, depending on whether pml - * is enabled the D-bit or the W-bit will be cleared. + * READONLY and non-flags changes were filtered out above, and the only + * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty + * logging isn't being toggled on or off. */ - if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops.slot_enable_log_dirty) { - kvm_x86_ops.slot_enable_log_dirty(kvm, new); - } else { - int level = - kvm_dirty_log_manual_protect_and_init_set(kvm) ? - PG_LEVEL_2M : PG_LEVEL_4K; + if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) + return; + + if (!log_dirty_pages) { + /* + * Dirty logging tracks sptes in 4k granularity, meaning that + * large sptes have to be split. If live migration succeeds, + * the guest in the source machine will be destroyed and large + * sptes will be created in the destination. However, if the + * guest continues to run in the source machine (for example if + * live migration fails), small sptes will remain around and + * cause bad performance. + * + * Scan sptes if dirty logging has been stopped, dropping those + * which can be collapsed into a single large-page spte. Later + * page faults will create the large-page sptes. + */ + kvm_mmu_zap_collapsible_sptes(kvm, new); + } else { + /* By default, write-protect everything to log writes. */ + int level = PG_LEVEL_4K; + + if (kvm_x86_ops.cpu_dirty_log_size) { + /* + * Clear all dirty bits, unless pages are treated as + * dirty from the get-go. + */ + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_slot_leaf_clear_dirty(kvm, new); /* + * Write-protect large pages on write so that dirty + * logging happens at 4k granularity. No need to + * write-protect small SPTEs since write accesses are + * logged by the CPU via dirty bits. + */ + level = PG_LEVEL_2M; + } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { + /* * If we're with initial-all-set, we don't need * to write protect any small page because * they're reported as dirty already. However @@ -10630,11 +10899,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * so that the page split can happen lazily on * the first write to the huge page. */ - kvm_mmu_slot_remove_write_access(kvm, new, level); + level = PG_LEVEL_2M; } - } else { - if (kvm_x86_ops.slot_disable_log_dirty) - kvm_x86_ops.slot_disable_log_dirty(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, level); } } @@ -10673,7 +10940,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && kvm_x86_ops.guest_apic_has_interrupt && - kvm_x86_ops.guest_apic_has_interrupt(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -10692,12 +10959,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops.nmi_allowed(vcpu, false))) + static_call(kvm_x86_nmi_allowed)(vcpu, false))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || (vcpu->arch.smi_pending && - kvm_x86_ops.smi_allowed(vcpu, false))) + static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -10731,7 +10998,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; - if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu)) + if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) return true; return false; @@ -10749,11 +11016,15 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.interrupt_allowed(vcpu, false); + return static_call(kvm_x86_interrupt_allowed)(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) { + /* Can't read the RIP when guest state is protected, just return 0 */ + if (vcpu->arch.guest_state_protected) + return 0; + if (is_64_bit_mode(vcpu)) return kvm_rip_read(vcpu); return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + @@ -10771,7 +11042,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; - rflags = kvm_x86_ops.get_rflags(vcpu); + rflags = static_call(kvm_x86_get_rflags)(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; @@ -10783,7 +11054,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - kvm_x86_ops.set_rflags(vcpu, rflags); + static_call(kvm_x86_set_rflags)(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -10913,7 +11184,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) return false; if (!kvm_pv_async_pf_enabled(vcpu) || - (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0)) + (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0)) return false; return true; @@ -11058,7 +11329,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); - ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, + ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) @@ -11083,7 +11354,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -11094,7 +11365,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set); + return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); } bool kvm_vector_hashing_enabled(void) @@ -11263,6 +11534,180 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_handle_invpcid); +static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_mmio_fragment *frag; + unsigned int len; + + BUG_ON(!vcpu->mmio_needed); + + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len = min(8u, frag->len); + if (!vcpu->mmio_is_write) + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. */ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + + if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; + + // VMG change, at this point, we're always done + // RIP has already been advanced + return 1; + } + + // More MMIO is needed + run->mmio.phys_addr = frag->gpa; + run->mmio.len = min(8u, frag->len); + run->mmio.is_write = vcpu->mmio_is_write; + if (run->mmio.is_write) + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} + +int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, + void *data) +{ + int handled; + struct kvm_mmio_fragment *frag; + + if (!data) + return -EINVAL; + + handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data); + if (handled == bytes) + return 1; + + bytes -= handled; + gpa += handled; + data += handled; + + /*TODO: Check if need to increment number of frags */ + frag = vcpu->mmio_fragments; + vcpu->mmio_nr_fragments = 1; + frag->len = bytes; + frag->gpa = gpa; + frag->data = data; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = min(8u, frag->len); + vcpu->run->mmio.is_write = 1; + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write); + +int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, + void *data) +{ + int handled; + struct kvm_mmio_fragment *frag; + + if (!data) + return -EINVAL; + + handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data); + if (handled == bytes) + return 1; + + bytes -= handled; + gpa += handled; + data += handled; + + /*TODO: Check if need to increment number of frags */ + frag = vcpu->mmio_fragments; + vcpu->mmio_nr_fragments = 1; + frag->len = bytes; + frag->gpa = gpa; + frag->data = data; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = min(8u, frag->len); + vcpu->run->mmio.is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); + +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +{ + memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data, + vcpu->arch.pio.count * vcpu->arch.pio.size); + vcpu->arch.pio.count = 0; + + return 1; +} + +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count) +{ + int ret; + + ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port, + data, count); + if (ret) + return ret; + + vcpu->arch.pio.count = 0; + + return 0; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count) +{ + int ret; + + ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port, + data, count); + if (ret) { + vcpu->arch.pio.count = 0; + } else { + vcpu->arch.guest_ins_data = data; + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; + } + + return 0; +} + +int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count, + int in) +{ + return in ? kvm_sev_es_ins(vcpu, size, port, data, count) + : kvm_sev_es_outs(vcpu, size, port, data, count); +} +EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); @@ -11285,3 +11730,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e7ca622a468f..9035e34aa156 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -3,6 +3,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include <asm/mce.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" #include "kvm_emulate.h" @@ -97,7 +98,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) if (!is_long_mode(vcpu)) return false; - kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); return cs_l; } @@ -128,7 +129,7 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush_current(vcpu); + static_call(kvm_x86_tlb_flush_current)(vcpu); } static inline int is_pae(struct kvm_vcpu *vcpu) @@ -243,12 +244,12 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) { - return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); + return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu); } +void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, @@ -272,12 +273,15 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); +int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, + void *insn, int insn_len); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 supported_xcr0; +extern u64 host_xss; extern u64 supported_xss; static inline bool kvm_mpx_supported(void) @@ -294,6 +298,8 @@ extern int pi_inject_timer; extern struct static_key kvm_no_apic_vcpu; +extern bool report_ignored_msrs; + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, @@ -366,11 +372,29 @@ static inline bool kvm_dr6_valid(u64 data) return !(data >> 32); } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static inline void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s); +#endif +} + void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); @@ -404,7 +428,17 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); __reserved_bits |= X86_CR4_UMIP; \ if (!__cpu_has(__c, X86_FEATURE_VMX)) \ __reserved_bits |= X86_CR4_VMXE; \ + if (!__cpu_has(__c, X86_FEATURE_PCID)) \ + __reserved_bits |= X86_CR4_PCIDE; \ __reserved_bits; \ }) +int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes, + void *dst); +int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes, + void *dst); +int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count, + int in); + #endif diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c new file mode 100644 index 000000000000..ae17250e1efe --- /dev/null +++ b/arch/x86/kvm/xen.c @@ -0,0 +1,721 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. + * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * KVM Xen emulation + */ + +#include "x86.h" +#include "xen.h" +#include "hyperv.h" + +#include <linux/kvm_host.h> +#include <linux/sched/stat.h> + +#include <trace/events/kvm.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> + +#include "trace.h" + +DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); + +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +{ + gpa_t gpa = gfn_to_gpa(gfn); + int wc_ofs, sec_hi_ofs; + int ret; + int idx = srcu_read_lock(&kvm->srcu); + + ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, + gpa, PAGE_SIZE); + if (ret) + goto out; + + kvm->arch.xen.shinfo_set = true; + + /* Paranoia checks on the 32-bit struct layout */ + BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); + BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924); + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + /* 32-bit location by default */ + wc_ofs = offsetof(struct compat_shared_info, wc); + sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi); + +#ifdef CONFIG_X86_64 + /* Paranoia checks on the 64-bit struct layout */ + BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00); + BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c); + + if (kvm->arch.xen.long_mode) { + wc_ofs = offsetof(struct shared_info, wc); + sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi); + } +#endif + + kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs); + kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); + +out: + srcu_read_unlock(&kvm->srcu, idx); + return ret; +} + +static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +{ + struct kvm_vcpu_xen *vx = &v->arch.xen; + u64 now = get_kvmclock_ns(v->kvm); + u64 delta_ns = now - vx->runstate_entry_time; + u64 run_delay = current->sched_info.run_delay; + + if (unlikely(!vx->runstate_entry_time)) + vx->current_runstate = RUNSTATE_offline; + + /* + * Time waiting for the scheduler isn't "stolen" if the + * vCPU wasn't running anyway. + */ + if (vx->current_runstate == RUNSTATE_running) { + u64 steal_ns = run_delay - vx->last_steal; + + delta_ns -= steal_ns; + + vx->runstate_times[RUNSTATE_runnable] += steal_ns; + } + vx->last_steal = run_delay; + + vx->runstate_times[vx->current_runstate] += delta_ns; + vx->current_runstate = state; + vx->runstate_entry_time = now; +} + +void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) +{ + struct kvm_vcpu_xen *vx = &v->arch.xen; + uint64_t state_entry_time; + unsigned int offset; + + kvm_xen_update_runstate(v, state); + + if (!vx->runstate_set) + return; + + BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); + + offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time); +#ifdef CONFIG_X86_64 + /* + * The only difference is alignment of uint64_t in 32-bit. + * So the first field 'state' is accessed directly using + * offsetof() (where its offset happens to be zero), while the + * remaining fields which are all uint64_t, start at 'offset' + * which we tweak here by adding 4. + */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != + offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != + offsetof(struct compat_vcpu_runstate_info, time) + 4); + + if (v->kvm->arch.xen.long_mode) + offset = offsetof(struct vcpu_runstate_info, state_entry_time); +#endif + /* + * First write the updated state_entry_time at the appropriate + * location determined by 'offset'. + */ + state_entry_time = vx->runstate_entry_time; + state_entry_time |= XEN_RUNSTATE_UPDATE; + + BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) != + sizeof(state_entry_time)); + BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) != + sizeof(state_entry_time)); + + if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, + &state_entry_time, offset, + sizeof(state_entry_time))) + return; + smp_wmb(); + + /* + * Next, write the new runstate. This is in the *same* place + * for 32-bit and 64-bit guests, asserted here for paranoia. + */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != + offsetof(struct compat_vcpu_runstate_info, state)); + BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) != + sizeof(vx->current_runstate)); + BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) != + sizeof(vx->current_runstate)); + + if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, + &vx->current_runstate, + offsetof(struct vcpu_runstate_info, state), + sizeof(vx->current_runstate))) + return; + + /* + * Write the actual runstate times immediately after the + * runstate_entry_time. + */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != + offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); + BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != + offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); + BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != + sizeof(((struct compat_vcpu_runstate_info *)0)->time)); + BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != + sizeof(vx->runstate_times)); + + if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, + &vx->runstate_times[0], + offset + sizeof(u64), + sizeof(vx->runstate_times))) + return; + + smp_wmb(); + + /* + * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's + * runstate_entry_time field. + */ + + state_entry_time &= ~XEN_RUNSTATE_UPDATE; + if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, + &state_entry_time, offset, + sizeof(state_entry_time))) + return; +} + +int __kvm_xen_has_interrupt(struct kvm_vcpu *v) +{ + u8 rc = 0; + + /* + * If the global upcall vector (HVMIRQ_callback_vector) is set and + * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. + */ + struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; + struct kvm_memslots *slots = kvm_memslots(v->kvm); + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); + + /* No need for compat handling here */ + BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != + offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); + BUILD_BUG_ON(sizeof(rc) != + sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending)); + BUILD_BUG_ON(sizeof(rc) != + sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending)); + + /* + * For efficiency, this mirrors the checks for using the valid + * cache in kvm_read_guest_offset_cached(), but just uses + * __get_user() instead. And falls back to the slow path. + */ + if (likely(slots->generation == ghc->generation && + !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { + /* Fast path */ + __get_user(rc, (u8 __user *)ghc->hva + offset); + } else { + /* Slow path */ + kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, + sizeof(rc)); + } + + return rc; +} + +int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) +{ + int r = -ENOENT; + + mutex_lock(&kvm->lock); + + switch (data->type) { + case KVM_XEN_ATTR_TYPE_LONG_MODE: + if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { + r = -EINVAL; + } else { + kvm->arch.xen.long_mode = !!data->u.long_mode; + r = 0; + } + break; + + case KVM_XEN_ATTR_TYPE_SHARED_INFO: + if (data->u.shared_info.gfn == GPA_INVALID) { + kvm->arch.xen.shinfo_set = false; + r = 0; + break; + } + r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + break; + + + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: + if (data->u.vector && data->u.vector < 0x10) + r = -EINVAL; + else { + kvm->arch.xen.upcall_vector = data->u.vector; + r = 0; + } + break; + + default: + break; + } + + mutex_unlock(&kvm->lock); + return r; +} + +int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) +{ + int r = -ENOENT; + + mutex_lock(&kvm->lock); + + switch (data->type) { + case KVM_XEN_ATTR_TYPE_LONG_MODE: + data->u.long_mode = kvm->arch.xen.long_mode; + r = 0; + break; + + case KVM_XEN_ATTR_TYPE_SHARED_INFO: + if (kvm->arch.xen.shinfo_set) + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); + else + data->u.shared_info.gfn = GPA_INVALID; + r = 0; + break; + + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: + data->u.vector = kvm->arch.xen.upcall_vector; + r = 0; + break; + + default: + break; + } + + mutex_unlock(&kvm->lock); + return r; +} + +int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) +{ + int idx, r = -ENOENT; + + mutex_lock(&vcpu->kvm->lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (data->type) { + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + /* No compat necessary here. */ + BUILD_BUG_ON(sizeof(struct vcpu_info) != + sizeof(struct compat_vcpu_info)); + BUILD_BUG_ON(offsetof(struct vcpu_info, time) != + offsetof(struct compat_vcpu_info, time)); + + if (data->u.gpa == GPA_INVALID) { + vcpu->arch.xen.vcpu_info_set = false; + r = 0; + break; + } + + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, + sizeof(struct vcpu_info)); + if (!r) { + vcpu->arch.xen.vcpu_info_set = true; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + break; + + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: + if (data->u.gpa == GPA_INVALID) { + vcpu->arch.xen.vcpu_time_info_set = false; + r = 0; + break; + } + + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache, + data->u.gpa, + sizeof(struct pvclock_vcpu_time_info)); + if (!r) { + vcpu->arch.xen.vcpu_time_info_set = true; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + if (data->u.gpa == GPA_INVALID) { + vcpu->arch.xen.runstate_set = false; + r = 0; + break; + } + + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.xen.runstate_cache, + data->u.gpa, + sizeof(struct vcpu_runstate_info)); + if (!r) { + vcpu->arch.xen.runstate_set = true; + } + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + if (data->u.runstate.state > RUNSTATE_offline) { + r = -EINVAL; + break; + } + + kvm_xen_update_runstate(vcpu, data->u.runstate.state); + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + if (data->u.runstate.state > RUNSTATE_offline) { + r = -EINVAL; + break; + } + if (data->u.runstate.state_entry_time != + (data->u.runstate.time_running + + data->u.runstate.time_runnable + + data->u.runstate.time_blocked + + data->u.runstate.time_offline)) { + r = -EINVAL; + break; + } + if (get_kvmclock_ns(vcpu->kvm) < + data->u.runstate.state_entry_time) { + r = -EINVAL; + break; + } + + vcpu->arch.xen.current_runstate = data->u.runstate.state; + vcpu->arch.xen.runstate_entry_time = + data->u.runstate.state_entry_time; + vcpu->arch.xen.runstate_times[RUNSTATE_running] = + data->u.runstate.time_running; + vcpu->arch.xen.runstate_times[RUNSTATE_runnable] = + data->u.runstate.time_runnable; + vcpu->arch.xen.runstate_times[RUNSTATE_blocked] = + data->u.runstate.time_blocked; + vcpu->arch.xen.runstate_times[RUNSTATE_offline] = + data->u.runstate.time_offline; + vcpu->arch.xen.last_steal = current->sched_info.run_delay; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + if (data->u.runstate.state > RUNSTATE_offline && + data->u.runstate.state != (u64)-1) { + r = -EINVAL; + break; + } + /* The adjustment must add up */ + if (data->u.runstate.state_entry_time != + (data->u.runstate.time_running + + data->u.runstate.time_runnable + + data->u.runstate.time_blocked + + data->u.runstate.time_offline)) { + r = -EINVAL; + break; + } + + if (get_kvmclock_ns(vcpu->kvm) < + (vcpu->arch.xen.runstate_entry_time + + data->u.runstate.state_entry_time)) { + r = -EINVAL; + break; + } + + vcpu->arch.xen.runstate_entry_time += + data->u.runstate.state_entry_time; + vcpu->arch.xen.runstate_times[RUNSTATE_running] += + data->u.runstate.time_running; + vcpu->arch.xen.runstate_times[RUNSTATE_runnable] += + data->u.runstate.time_runnable; + vcpu->arch.xen.runstate_times[RUNSTATE_blocked] += + data->u.runstate.time_blocked; + vcpu->arch.xen.runstate_times[RUNSTATE_offline] += + data->u.runstate.time_offline; + + if (data->u.runstate.state <= RUNSTATE_offline) + kvm_xen_update_runstate(vcpu, data->u.runstate.state); + r = 0; + break; + + default: + break; + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + mutex_unlock(&vcpu->kvm->lock); + return r; +} + +int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) +{ + int r = -ENOENT; + + mutex_lock(&vcpu->kvm->lock); + + switch (data->type) { + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + if (vcpu->arch.xen.vcpu_info_set) + data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; + else + data->u.gpa = GPA_INVALID; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: + if (vcpu->arch.xen.vcpu_time_info_set) + data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; + else + data->u.gpa = GPA_INVALID; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + if (vcpu->arch.xen.runstate_set) { + data->u.gpa = vcpu->arch.xen.runstate_cache.gpa; + r = 0; + } + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + data->u.runstate.state = vcpu->arch.xen.current_runstate; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + data->u.runstate.state = vcpu->arch.xen.current_runstate; + data->u.runstate.state_entry_time = + vcpu->arch.xen.runstate_entry_time; + data->u.runstate.time_running = + vcpu->arch.xen.runstate_times[RUNSTATE_running]; + data->u.runstate.time_runnable = + vcpu->arch.xen.runstate_times[RUNSTATE_runnable]; + data->u.runstate.time_blocked = + vcpu->arch.xen.runstate_times[RUNSTATE_blocked]; + data->u.runstate.time_offline = + vcpu->arch.xen.runstate_times[RUNSTATE_offline]; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST: + r = -EINVAL; + break; + + default: + break; + } + + mutex_unlock(&vcpu->kvm->lock); + return r; +} + +int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u32 page_num = data & ~PAGE_MASK; + u64 page_addr = data & PAGE_MASK; + bool lm = is_long_mode(vcpu); + + /* Latch long_mode for shared_info pages etc. */ + vcpu->kvm->arch.xen.long_mode = lm; + + /* + * If Xen hypercall intercept is enabled, fill the hypercall + * page with VMCALL/VMMCALL instructions since that's what + * we catch. Else the VMM has provided the hypercall pages + * with instructions of its own choosing, so use those. + */ + if (kvm_xen_hypercall_enabled(kvm)) { + u8 instructions[32]; + int i; + + if (page_num) + return 1; + + /* mov imm32, %eax */ + instructions[0] = 0xb8; + + /* vmcall / vmmcall */ + kvm_x86_ops.patch_hypercall(vcpu, instructions + 5); + + /* ret */ + instructions[8] = 0xc3; + + /* int3 to pad */ + memset(instructions + 9, 0xcc, sizeof(instructions) - 9); + + for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) { + *(u32 *)&instructions[1] = i; + if (kvm_vcpu_write_guest(vcpu, + page_addr + (i * sizeof(instructions)), + instructions, sizeof(instructions))) + return 1; + } + } else { + /* + * Note, truncation is a non-issue as 'lm' is guaranteed to be + * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. + */ + hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 + : kvm->arch.xen_hvm_config.blob_addr_32; + u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 + : kvm->arch.xen_hvm_config.blob_size_32; + u8 *page; + + if (page_num >= blob_size) + return 1; + + blob_addr += page_num * PAGE_SIZE; + + page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE); + if (IS_ERR(page)) + return PTR_ERR(page); + + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { + kfree(page); + return 1; + } + } + return 0; +} + +int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) +{ + if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) + return -EINVAL; + + /* + * With hypercall interception the kernel generates its own + * hypercall page so it must not be provided. + */ + if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) && + (xhc->blob_addr_32 || xhc->blob_addr_64 || + xhc->blob_size_32 || xhc->blob_size_64)) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (xhc->msr && !kvm->arch.xen_hvm_config.msr) + static_branch_inc(&kvm_xen_enabled.key); + else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) + static_branch_slow_dec_deferred(&kvm_xen_enabled); + + memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); + + mutex_unlock(&kvm->lock); + return 0; +} + +void kvm_xen_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.xen_hvm_config.msr) + static_branch_slow_dec_deferred(&kvm_xen_enabled); +} + +static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) +{ + kvm_rax_write(vcpu, result); + return kvm_skip_emulated_instruction(vcpu); +} + +static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip))) + return 1; + + return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); +} + +int kvm_xen_hypercall(struct kvm_vcpu *vcpu) +{ + bool longmode; + u64 input, params[6]; + + input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); + + /* Hyper-V hypercalls get bit 31 set in EAX */ + if ((input & 0x80000000) && + kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + longmode = is_64_bit_mode(vcpu); + if (!longmode) { + params[0] = (u32)kvm_rbx_read(vcpu); + params[1] = (u32)kvm_rcx_read(vcpu); + params[2] = (u32)kvm_rdx_read(vcpu); + params[3] = (u32)kvm_rsi_read(vcpu); + params[4] = (u32)kvm_rdi_read(vcpu); + params[5] = (u32)kvm_rbp_read(vcpu); + } +#ifdef CONFIG_X86_64 + else { + params[0] = (u64)kvm_rdi_read(vcpu); + params[1] = (u64)kvm_rsi_read(vcpu); + params[2] = (u64)kvm_rdx_read(vcpu); + params[3] = (u64)kvm_r10_read(vcpu); + params[4] = (u64)kvm_r8_read(vcpu); + params[5] = (u64)kvm_r9_read(vcpu); + } +#endif + trace_kvm_xen_hypercall(input, params[0], params[1], params[2], + params[3], params[4], params[5]); + + vcpu->run->exit_reason = KVM_EXIT_XEN; + vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; + vcpu->run->xen.u.hcall.longmode = longmode; + vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu); + vcpu->run->xen.u.hcall.input = input; + vcpu->run->xen.u.hcall.params[0] = params[0]; + vcpu->run->xen.u.hcall.params[1] = params[1]; + vcpu->run->xen.u.hcall.params[2] = params[2]; + vcpu->run->xen.u.hcall.params[3] = params[3]; + vcpu->run->xen.u.hcall.params[4] = params[4]; + vcpu->run->xen.u.hcall.params[5] = params[5]; + vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = + kvm_xen_hypercall_complete_userspace; + + return 0; +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h new file mode 100644 index 000000000000..463a7844a8ca --- /dev/null +++ b/arch/x86/kvm/xen.h @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. + * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * KVM Xen emulation + */ + +#ifndef __ARCH_X86_KVM_XEN_H__ +#define __ARCH_X86_KVM_XEN_H__ + +#ifdef CONFIG_KVM_XEN +#include <linux/jump_label_ratelimit.h> + +extern struct static_key_false_deferred kvm_xen_enabled; + +int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); +int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); +int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); +int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_destroy_vm(struct kvm *kvm); + +static inline bool kvm_xen_msr_enabled(struct kvm *kvm) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + kvm->arch.xen_hvm_config.msr; +} + +static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + (kvm->arch.xen_hvm_config.flags & + KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); +} + +static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) +{ + if (static_branch_unlikely(&kvm_xen_enabled.key) && + vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector) + return __kvm_xen_has_interrupt(vcpu); + + return 0; +} +#else +static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) +{ + return 1; +} + +static inline void kvm_xen_destroy_vm(struct kvm *kvm) +{ +} + +static inline bool kvm_xen_msr_enabled(struct kvm *kvm) +{ + return false; +} + +static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) +{ + return false; +} + +static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) +{ + return 0; +} +#endif + +int kvm_xen_hypercall(struct kvm_vcpu *vcpu); + +#include <asm/pvclock-abi.h> +#include <asm/xen/interface.h> +#include <xen/interface/vcpu.h> + +void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state); + +static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu) +{ + kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running); +} + +static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) +{ + /* + * If the vCPU wasn't preempted but took a normal exit for + * some reason (hypercalls, I/O, etc.), that is accounted as + * still RUNSTATE_running, as the VMM is still operating on + * behalf of the vCPU. Only if the VMM does actually block + * does it need to enter RUNSTATE_blocked. + */ + if (vcpu->preempted) + kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); +} + +/* 32-bit compatibility definitions, also used natively in 32-bit build */ +struct compat_arch_vcpu_info { + unsigned int cr2; + unsigned int pad[5]; +}; + +struct compat_vcpu_info { + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + uint16_t pad; + uint32_t evtchn_pending_sel; + struct compat_arch_vcpu_info arch; + struct pvclock_vcpu_time_info time; +}; /* 64 bytes (x86) */ + +struct compat_arch_shared_info { + unsigned int max_pfn; + unsigned int pfn_to_mfn_frame_list_list; + unsigned int nmi_reason; + unsigned int p2m_cr3; + unsigned int p2m_vaddr; + unsigned int p2m_generation; + uint32_t wc_sec_hi; +}; + +struct compat_shared_info { + struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS]; + uint32_t evtchn_pending[32]; + uint32_t evtchn_mask[32]; + struct pvclock_wall_clock wc; + struct compat_arch_shared_info arch; +}; + +struct compat_vcpu_runstate_info { + int state; + uint64_t state_entry_time; + uint64_t time[4]; +} __attribute__((packed)); + +#endif /* __ARCH_X86_KVM_XEN_H__ */ diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c index c13e8c9ee926..80efd45a7761 100644 --- a/arch/x86/lib/copy_mc.c +++ b/arch/x86/lib/copy_mc.c @@ -10,10 +10,6 @@ #include <asm/mce.h> #ifdef CONFIG_X86_MCE -/* - * See COPY_MC_TEST for self-test of the copy_mc_fragile() - * implementation. - */ static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S index 892d8915f609..e5f77e293034 100644 --- a/arch/x86/lib/copy_mc_64.S +++ b/arch/x86/lib/copy_mc_64.S @@ -2,14 +2,11 @@ /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ #include <linux/linkage.h> -#include <asm/copy_mc_test.h> -#include <asm/export.h> #include <asm/asm.h> #ifndef CONFIG_UML #ifdef CONFIG_X86_MCE -COPY_MC_TEST_CTL /* * copy_mc_fragile - copy memory with indication if an exception / fault happened @@ -38,8 +35,6 @@ SYM_FUNC_START(copy_mc_fragile) subl %ecx, %edx .L_read_leading_bytes: movb (%rsi), %al - COPY_MC_TEST_SRC %rsi 1 .E_leading_bytes - COPY_MC_TEST_DST %rdi 1 .E_leading_bytes .L_write_leading_bytes: movb %al, (%rdi) incq %rsi @@ -55,8 +50,6 @@ SYM_FUNC_START(copy_mc_fragile) .L_read_words: movq (%rsi), %r8 - COPY_MC_TEST_SRC %rsi 8 .E_read_words - COPY_MC_TEST_DST %rdi 8 .E_write_words .L_write_words: movq %r8, (%rdi) addq $8, %rsi @@ -73,8 +66,6 @@ SYM_FUNC_START(copy_mc_fragile) movl %edx, %ecx .L_read_trailing_bytes: movb (%rsi), %al - COPY_MC_TEST_SRC %rsi 1 .E_trailing_bytes - COPY_MC_TEST_DST %rdi 1 .E_trailing_bytes .L_write_trailing_bytes: movb %al, (%rdi) incq %rsi @@ -88,7 +79,6 @@ SYM_FUNC_START(copy_mc_fragile) .L_done: ret SYM_FUNC_END(copy_mc_fragile) -EXPORT_SYMBOL_GPL(copy_mc_fragile) .section .fixup, "ax" /* diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4229950a5d78..bb0b3fe1e0a0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } +static unsigned long insn_get_effective_ip(struct pt_regs *regs) +{ + unsigned long seg_base = 0; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + return seg_base + regs->ip; +} + /** * insn_fetch_from_user() - Copy instruction bytes from user-space memory * @regs: Structure with register values as seen when entering kernel mode @@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { - unsigned long seg_base = 0; + unsigned long ip; int not_copied; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) { - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - if (seg_base == -1L) - return 0; - } + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; + + not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory + * while in atomic code + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. This function must be used in atomic context. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long ip; + int not_copied; + + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - MAX_INSN_SIZE); + not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); return MAX_INSN_SIZE - not_copied; } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 404279563891..435630a6ec97 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -5,6 +5,7 @@ * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ +#include <linux/kernel.h> #ifdef __KERNEL__ #include <linux/string.h> #else @@ -15,15 +16,28 @@ #include <asm/emulate_prefix.h> +#define leXX_to_cpu(t, r) \ +({ \ + __typeof__(t) v; \ + switch (sizeof(t)) { \ + case 4: v = le32_to_cpu(r); break; \ + case 2: v = le16_to_cpu(r); break; \ + case 1: v = r; break; \ + default: \ + BUILD_BUG(); break; \ + } \ + v; \ +}) + /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r = *(t*)((insn)->next_byte + n); r; }) + ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) @@ -147,9 +161,9 @@ found: b = insn->prefixes.bytes[3]; for (i = 0; i < nb; i++) if (prefixes->bytes[i] == lb) - prefixes->bytes[i] = b; + insn_set_byte(prefixes, i, b); } - insn->prefixes.bytes[3] = lb; + insn_set_byte(&insn->prefixes, 3, lb); } /* Decode REX prefix */ @@ -157,8 +171,7 @@ found: b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_rex_prefix(attr)) { - insn->rex_prefix.value = b; - insn->rex_prefix.nbytes = 1; + insn_field_set(&insn->rex_prefix, b, 1); insn->next_byte++; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ @@ -181,13 +194,13 @@ found: if (X86_MODRM_MOD(b2) != 3) goto vex_end; } - insn->vex_prefix.bytes[0] = b; - insn->vex_prefix.bytes[1] = b2; + insn_set_byte(&insn->vex_prefix, 0, b); + insn_set_byte(&insn->vex_prefix, 1, b2); if (inat_is_evex_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); - insn->vex_prefix.bytes[2] = b2; + insn_set_byte(&insn->vex_prefix, 2, b2); b2 = peek_nbyte_next(insn_byte_t, insn, 3); - insn->vex_prefix.bytes[3] = b2; + insn_set_byte(&insn->vex_prefix, 3, b2); insn->vex_prefix.nbytes = 4; insn->next_byte += 4; if (insn->x86_64 && X86_VEX_W(b2)) @@ -195,7 +208,7 @@ found: insn->opnd_bytes = 8; } else if (inat_is_vex3_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); - insn->vex_prefix.bytes[2] = b2; + insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) @@ -207,7 +220,7 @@ found: * Makes it easier to decode vex.W, vex.vvvv, * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. */ - insn->vex_prefix.bytes[2] = b2 & 0x7f; + insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f); insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } @@ -243,7 +256,7 @@ void insn_get_opcode(struct insn *insn) /* Get first opcode */ op = get_next(insn_byte_t, insn); - opcode->bytes[0] = op; + insn_set_byte(opcode, 0, op); opcode->nbytes = 1; /* Check if there is VEX prefix or not */ @@ -295,8 +308,7 @@ void insn_get_modrm(struct insn *insn) if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); - modrm->value = mod; - modrm->nbytes = 1; + insn_field_set(modrm, mod, 1); if (inat_is_group(insn->attr)) { pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, @@ -334,7 +346,7 @@ int insn_rip_relative(struct insn *insn) * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ - return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); + return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5); } /** @@ -353,11 +365,11 @@ void insn_get_sib(struct insn *insn) if (!insn->modrm.got) insn_get_modrm(insn); if (insn->modrm.nbytes) { - modrm = (insn_byte_t)insn->modrm.value; + modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { - insn->sib.value = get_next(insn_byte_t, insn); - insn->sib.nbytes = 1; + insn_field_set(&insn->sib, + get_next(insn_byte_t, insn), 1); } } insn->sib.got = 1; @@ -407,19 +419,18 @@ void insn_get_displacement(struct insn *insn) if (mod == 3) goto out; if (mod == 1) { - insn->displacement.value = get_next(signed char, insn); - insn->displacement.nbytes = 1; + insn_field_set(&insn->displacement, + get_next(signed char, insn), 1); } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { - insn->displacement.value = - get_next(short, insn); - insn->displacement.nbytes = 2; + insn_field_set(&insn->displacement, + get_next(short, insn), 2); } } else { if ((mod == 0 && rm == 5) || mod == 2 || (mod == 0 && base == 5)) { - insn->displacement.value = get_next(int, insn); - insn->displacement.nbytes = 4; + insn_field_set(&insn->displacement, + get_next(int, insn), 4); } } } @@ -435,18 +446,14 @@ static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: - insn->moffset1.value = get_next(short, insn); - insn->moffset1.nbytes = 2; + insn_field_set(&insn->moffset1, get_next(short, insn), 2); break; case 4: - insn->moffset1.value = get_next(int, insn); - insn->moffset1.nbytes = 4; + insn_field_set(&insn->moffset1, get_next(int, insn), 4); break; case 8: - insn->moffset1.value = get_next(int, insn); - insn->moffset1.nbytes = 4; - insn->moffset2.value = get_next(int, insn); - insn->moffset2.nbytes = 4; + insn_field_set(&insn->moffset1, get_next(int, insn), 4); + insn_field_set(&insn->moffset2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; @@ -464,13 +471,11 @@ static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: - insn->immediate.value = get_next(short, insn); - insn->immediate.nbytes = 2; + insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case 4: case 8: - insn->immediate.value = get_next(int, insn); - insn->immediate.nbytes = 4; + insn_field_set(&insn->immediate, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; @@ -487,18 +492,15 @@ static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: - insn->immediate1.value = get_next(short, insn); - insn->immediate1.nbytes = 2; + insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: - insn->immediate1.value = get_next(int, insn); + insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn->immediate1.nbytes = 4; break; case 8: - insn->immediate1.value = get_next(int, insn); - insn->immediate1.nbytes = 4; - insn->immediate2.value = get_next(int, insn); - insn->immediate2.nbytes = 4; + insn_field_set(&insn->immediate1, get_next(int, insn), 4); + insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; @@ -515,12 +517,10 @@ static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: - insn->immediate1.value = get_next(short, insn); - insn->immediate1.nbytes = 2; + insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: - insn->immediate1.value = get_next(int, insn); - insn->immediate1.nbytes = 4; + insn_field_set(&insn->immediate1, get_next(int, insn), 4); break; case 8: /* ptr16:64 is not exist (no segment) */ @@ -528,8 +528,7 @@ static int __get_immptr(struct insn *insn) default: /* opnd_bytes must be modified manually */ goto err_out; } - insn->immediate2.value = get_next(unsigned short, insn); - insn->immediate2.nbytes = 2; + insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2); insn->immediate1.got = insn->immediate2.got = 1; return 1; @@ -565,22 +564,17 @@ void insn_get_immediate(struct insn *insn) switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: - insn->immediate.value = get_next(signed char, insn); - insn->immediate.nbytes = 1; + insn_field_set(&insn->immediate, get_next(signed char, insn), 1); break; case INAT_IMM_WORD: - insn->immediate.value = get_next(short, insn); - insn->immediate.nbytes = 2; + insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case INAT_IMM_DWORD: - insn->immediate.value = get_next(int, insn); - insn->immediate.nbytes = 4; + insn_field_set(&insn->immediate, get_next(int, insn), 4); break; case INAT_IMM_QWORD: - insn->immediate1.value = get_next(int, insn); - insn->immediate1.nbytes = 4; - insn->immediate2.value = get_next(int, insn); - insn->immediate2.nbytes = 4; + insn_field_set(&insn->immediate1, get_next(int, insn), 4); + insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; case INAT_IMM_PTR: if (!__get_immptr(insn)) @@ -599,8 +593,7 @@ void insn_get_immediate(struct insn *insn) goto err_out; } if (inat_has_second_immediate(insn->attr)) { - insn->immediate2.value = get_next(signed char, insn); - insn->immediate2.nbytes = 1; + insn_field_set(&insn->immediate2, get_next(signed char, insn), 1); } done: insn->immediate.got = 1; diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 4321fa02e18d..419365c48b2a 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -26,6 +26,16 @@ #include <asm/fpu/api.h> #include <asm/asm.h> +/* + * Use KFPU_387. MMX instructions are not affected by MXCSR, + * but both AMD and Intel documentation states that even integer MMX + * operations will result in #MF if an exception is pending in FCW. + * + * EMMS is not needed afterwards because, after calling kernel_fpu_end(), + * any subsequent user of the 387 stack will reinitialize it using + * KFPU_387. + */ + void *_mmx_memcpy(void *to, const void *from, size_t len) { void *p; @@ -37,7 +47,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) p = to; i = len >> 6; /* len/64 */ - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( "1: prefetch (%0)\n" /* This set is 28 bytes */ @@ -127,7 +137,7 @@ static void fast_clear_page(void *page) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : @@ -160,7 +170,7 @@ static void fast_copy_page(void *to, void *from) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); /* * maybe the prefetch stuff can go before the expensive fnsave... @@ -247,7 +257,7 @@ static void fast_clear_page(void *page) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : @@ -282,7 +292,7 @@ static void fast_copy_page(void *to, void *from) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( "1: prefetch (%0)\n" diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index fee8b9c0520c..75a0915b0d01 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info) int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; - call_single_data_t csd = { - .func = __rdmsr_safe_on_cpu, - .info = &rv, - }; + call_single_data_t csd; int err; + INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); + memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index b4c43a9b1483..f6fb1d218dcc 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -28,7 +28,7 @@ SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg) jmp .Lspec_trap_\@ .Ldo_rop_\@: mov %\reg, (%_ASM_SP) - UNWIND_HINT_RET_OFFSET + UNWIND_HINT_FUNC ret SYM_FUNC_END(__x86_retpoline_\reg) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 3f435d7fca5e..c3e8a62ca561 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -9,9 +9,23 @@ #include <asm/tlbflush.h> -/* - * We rely on the nested NMI work to allow atomic faults from the NMI path; the - * nested NMI paths are careful to preserve CR2. +/** + * copy_from_user_nmi - NMI safe copy from user + * @to: Pointer to the destination buffer + * @from: Pointer to a user space address of the current task + * @n: Number of bytes to copy + * + * Returns: The number of not copied bytes. 0 is success, i.e. all bytes copied + * + * Contrary to other copy_from_user() variants this function can be called + * from NMI context. Despite the name it is not restricted to be called + * from NMI context. It is safe to be called from any other context as + * well. It disables pagefaults across the copy which means a fault will + * abort the copy. + * + * For NMI context invocations this relies on the nested NMI work to allow + * atomic faults from the NMI path; the nested NMI paths are careful to + * preserve CR2. */ unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) @@ -27,7 +41,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) /* * Even though this function is typically called from NMI/IRQ context * disable pagefaults so that its behaviour is consistent even when - * called form other contexts. + * called from other contexts. */ pagefault_disable(); ret = __copy_from_user_inatomic(to, from, n); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 82bf37a5c9ec..a73347e2cdfc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -9,6 +9,7 @@ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/extable.h> /* search_exception_tables */ #include <linux/memblock.h> /* max_low_pfn */ +#include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ @@ -16,7 +17,7 @@ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ -#include <linux/efi.h> /* efi_recover_from_page_fault()*/ +#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ @@ -25,11 +26,12 @@ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ -#include <asm/efi.h> /* efi_recover_from_page_fault()*/ +#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ +#include <asm/vdso.h> /* fixup_vdso_exception() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -53,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * @@ -82,11 +84,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, #ifdef CONFIG_X86_64 case 0x40: /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. + * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif @@ -109,6 +107,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, } } +static bool is_amd_k8_pre_npt(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && + c->x86_vendor == X86_VENDOR_AMD && + c->x86 == 0xf && c->x86_model < 0x40); +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { @@ -116,6 +123,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) unsigned char *instr; int prefetch = 0; + /* Erratum #91 affects AMD K8, pre-NPT CPUs */ + if (!is_amd_k8_pre_npt()) + return 0; + /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: @@ -126,20 +137,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) - return 0; + /* + * This code has historically always bailed out if IP points to a + * not-present page (e.g. due to a race). No one has ever + * complained about this. + */ + pagefault_disable(); while (instr < max_instr) { unsigned char opcode; - if (get_kernel_nofault(opcode, instr)) - break; + if (user_mode(regs)) { + if (get_user(opcode, instr)) + break; + } else { + if (get_kernel_nofault(opcode, instr)) + break; + } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } + + pagefault_enable(); return prefetch; } @@ -261,25 +283,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -#ifdef CONFIG_VM86 - unsigned long bit; - - if (!v8086_mode(regs) || !tsk->thread.vm86) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.vm86->screen_bitmap |= 1 << bit; -#endif -} - static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; @@ -334,15 +337,6 @@ KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -} - static int bad_address(void *p) { unsigned long dummy; @@ -426,6 +420,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) || boot_cpu_data.x86 != 0xf) return 0; + if (user_mode(regs)) + return 0; + if (address != regs->ip) return 0; @@ -461,10 +458,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) } /* Pentium F0 0F C7 C8 bug workaround: */ -static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_F00F_BUG - if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { + if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && + idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } @@ -602,11 +601,9 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -static void set_signal_archinfo(unsigned long address, - unsigned long error_code) +static void sanitize_error_code(unsigned long address, + unsigned long *error_code) { - struct task_struct *tsk = current; - /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to @@ -617,7 +614,13 @@ static void set_signal_archinfo(unsigned long address, * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) - error_code |= X86_PF_PROT; + *error_code |= X86_PF_PROT; +} + +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; @@ -625,51 +628,20 @@ static void set_signal_archinfo(unsigned long address, } static noinline void -no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) +page_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - struct task_struct *tsk = current; unsigned long flags; int sig; if (user_mode(regs)) { /* - * This is an implicit supervisor-mode access from user - * mode. Bypass all the kernel-mode recovery code and just - * OOPS. + * Implicit kernel access from user mode? Skip the stack + * overflow and EFI special cases. */ goto oops; } - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { - /* - * Any interrupt that takes a fault gets the fixup. This makes - * the below recursive fault logic only apply to a faults from - * task context. - */ - if (in_interrupt()) - return; - - /* - * Per the above we're !in_interrupt(), aka. task context. - * - * In this case we need to make sure we're not recursively - * faulting through the emulate_vsyscall() logic. - */ - if (current->thread.sig_on_uaccess_err && signal) { - set_signal_archinfo(address, error_code); - - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); - } - - /* - * Barring that, we can do the fixup and be happy. - */ - return; - } - #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial @@ -677,8 +649,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, * that we're in vmalloc space to avoid this. */ if (is_vmalloc_addr((void *)address) && - (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || - address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + (((unsigned long)current->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) { unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -702,29 +674,18 @@ no_context(struct pt_regs *regs, unsigned long error_code, #endif /* - * 32-bit: - * - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * 64-bit: - * - * Hall of shame of CPU/BIOS bugs. + * Buggy firmware could access regions which might page fault. If + * this happens, EFI has a special OOPS path that will try to + * avoid hanging the system. */ - if (is_prefetch(regs, error_code, address)) - return; + if (IS_ENABLED(CONFIG_EFI)) + efi_crash_gracefully_on_page_fault(address); - if (is_errata93(regs, address)) + /* Only not-present faults should be handled by KFENCE. */ + if (!(error_code & X86_PF_PROT) && + kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; - /* - * Buggy firmware could access regions which might page fault, try to - * recover from such faults. - */ - if (IS_ENABLED(CONFIG_EFI)) - efi_recover_from_page_fault(address); - oops: /* * Oops. The kernel tried to access some bad page. We'll have to @@ -734,7 +695,7 @@ oops: show_fault_oops(regs, error_code, address); - if (task_stack_end_corrupted(tsk)) + if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; @@ -747,6 +708,53 @@ oops: oops_end(flags, regs, sig); } +static noinline void +kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ + WARN_ON_ONCE(user_mode(regs)); + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to a faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ + if (current->thread.sig_on_uaccess_err && signal) { + sanitize_error_code(address, &error_code); + + set_signal_archinfo(address, error_code); + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } + + /* + * Barring that, we can do the fixup and be happy. + */ + return; + } + + /* + * AMD erratum #91 manifests as a spurious page fault on a PREFETCH + * instruction. + */ + if (is_prefetch(regs, error_code, address)) + return; + + page_fault_oops(regs, error_code, address); +} + /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: @@ -789,50 +797,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs) && (error_code & X86_PF_USER)) { - /* - * It's possible to have interrupts off here: - */ - local_irq_enable(); + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); + return; + } - /* - * Valid to do another page fault here because this one came - * from user space: - */ - if (is_prefetch(regs, error_code, address)) - return; + if (!(error_code & X86_PF_USER)) { + /* Implicit user access to kernel memory -- just oops */ + page_fault_oops(regs, error_code, address); + return; + } - if (is_errata100(regs, address)) - return; + /* + * User mode accesses just cause a SIGSEGV. + * It's possible to have interrupts off here: + */ + local_irq_enable(); - /* - * To avoid leaking information about the kernel page table - * layout, pretend that user-mode accesses to kernel addresses - * are always protection faults. - */ - if (address >= TASK_SIZE_MAX) - error_code |= X86_PF_PROT; + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; - if (likely(show_unhandled_signals)) - show_signal_msg(regs, error_code, address, tsk); + if (is_errata100(regs, address)) + return; - set_signal_archinfo(address, error_code); + sanitize_error_code(address, &error_code); - if (si_code == SEGV_PKUERR) - force_sig_pkuerr((void __user *)address, pkey); + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; - force_sig_fault(SIGSEGV, si_code, (void __user *)address); + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); - local_irq_disable(); + set_signal_archinfo(address, error_code); - return; - } + if (si_code == SEGV_PKUERR) + force_sig_pkuerr((void __user *)address, pkey); - if (is_f00f_bug(regs, address)) - return; + force_sig_fault(SIGSEGV, si_code, (void __user *)address); - no_context(regs, error_code, address, SIGSEGV, si_code); + local_irq_disable(); } static noinline void @@ -922,8 +929,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -931,6 +938,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, if (is_prefetch(regs, error_code, address)) return; + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; + set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE @@ -952,40 +964,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } -static noinline void -mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, vm_fault_t fault) -{ - if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, 0, 0); - return; - } - - if (fault & VM_FAULT_OOM) { - /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); - return; - } - - /* - * We ran out of memory, call the OOM killer, and return the - * userspace (which will retry the fault, or kill us if we got - * oom-killed): - */ - pagefault_out_of_memory(); - } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| - VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, fault); - else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address); - else - BUG(); - } -} - static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) @@ -1102,6 +1080,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) return 1; /* + * SGX hardware blocked the access. This usually happens + * when the enclave memory contents have been destroyed, like + * after a suspend/resume cycle. In any case, the kernel can't + * fix the cause of the fault. Handle the fault as an access + * error even in cases where no actual access violation + * occurred. This allows userspace to rebuild the enclave in + * response to the signal. + */ + if (unlikely(error_code & X86_PF_SGX)) + return 1; + + /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. @@ -1188,6 +1178,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } #endif + if (is_f00f_bug(regs, hw_error_code, address)) + return; + /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; @@ -1208,10 +1201,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(do_kern_addr_fault); -/* Handle faults in the user portion of the address space */ +/* + * Handle faults in the user portion of the address space. Nothing in here + * should check X86_PF_USER without a specific justification: for almost + * all purposes, we should treat a normal kernel access to user memory + * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. + * The one exception is AC flag handling, which is, per the x86 + * architecture, special for WRUSS. + */ static inline void do_user_addr_fault(struct pt_regs *regs, - unsigned long hw_error_code, + unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; @@ -1223,6 +1223,21 @@ void do_user_addr_fault(struct pt_regs *regs, tsk = current; mm = tsk->mm; + if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { + /* + * Whoops, this is kernel mode code trying to execute from + * user memory. Unless this is AMD erratum #93, which + * corrupts RIP such that it looks like a user address, + * this is unrecoverable. Don't even try to look up the + * VMA or look for extable entries. + */ + if (is_errata93(regs, address)) + return; + + page_fault_oops(regs, error_code, address); + return; + } + /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) return; @@ -1231,8 +1246,8 @@ void do_user_addr_fault(struct pt_regs *regs, * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ - if (unlikely(hw_error_code & X86_PF_RSVD)) - pgtable_bad(regs, hw_error_code, address); + if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user @@ -1242,10 +1257,13 @@ void do_user_addr_fault(struct pt_regs *regs, * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && - !(hw_error_code & X86_PF_USER) && - !(regs->flags & X86_EFLAGS_AC))) - { - bad_area_nosemaphore(regs, hw_error_code, address); + !(error_code & X86_PF_USER) && + !(regs->flags & X86_EFLAGS_AC))) { + /* + * No extable entry here. This was a kernel access to an + * invalid pointer. get_kernel_nofault() will not get here. + */ + page_fault_oops(regs, error_code, address); return; } @@ -1254,7 +1272,7 @@ void do_user_addr_fault(struct pt_regs *regs, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1275,9 +1293,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (hw_error_code & X86_PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (hw_error_code & X86_PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1293,7 +1311,7 @@ void do_user_addr_fault(struct pt_regs *regs, * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { - if (emulate_vsyscall(hw_error_code, regs, address)) + if (emulate_vsyscall(error_code, regs, address)) return; } #endif @@ -1316,7 +1334,7 @@ void do_user_addr_fault(struct pt_regs *regs, * Fault from code in kernel from * which we do not expect faults. */ - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } retry: @@ -1332,17 +1350,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } @@ -1351,8 +1369,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(hw_error_code, vma))) { - bad_area_access_error(regs, hw_error_code, address, vma); + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); return; } @@ -1371,11 +1389,14 @@ good_area: */ fault = handle_mm_fault(vma, address, flags, regs); - /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { + /* + * Quick path to respond to signals. The core mm code + * has unlocked the mm for us if we get here. + */ if (!user_mode(regs)) - no_context(regs, hw_error_code, address, SIGBUS, - BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR); return; } @@ -1391,12 +1412,37 @@ good_area: } mmap_read_unlock(mm); - if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, hw_error_code, address, fault); + if (likely(!(fault & VM_FAULT_ERROR))) + return; + + if (fatal_signal_pending(current) && !user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); return; } - check_v8086_mode(regs, address, tsk); + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return; + } + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); + else + BUG(); + } } NOKPROBE_SYMBOL(do_user_addr_fault); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 075fe51317b0..2c54b76d8f84 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,65 +4,6 @@ #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - return kmap_atomic_prot_pfn(pfn, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - arch_flush_lazy_mmu_mode(); - } -#ifdef CONFIG_DEBUG_HIGHMEM - else { - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); - } -#endif -} -EXPORT_SYMBOL(kunmap_atomic_high); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index fe7a12599d8e..968d7005f4a7 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -62,6 +62,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, unsigned long addr, unsigned long end) { unsigned long next; + int result; for (; addr < end; addr = next) { p4d_t *p4d = p4d_page + p4d_index(addr); @@ -73,13 +74,20 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (p4d_present(*p4d)) { pud = pud_offset(p4d, 0); - ident_pud_init(info, pud, addr, next); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; } pud = (pud_t *)info->alloc_pgt_page(info->context); if (!pud) return -ENOMEM; - ident_pud_init(info, pud, addr, next); + + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c7a47603537f..dd694fb93916 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num) } /* - * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. - * With KASLR memory randomization, depending on the machine e820 memory - * and the PUD alignment. We may need twice more pages when KASLR memory + * By default need to be able to allocate page tables below PGD firstly for + * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping. + * With KASLR memory randomization, depending on the machine e820 memory and the + * PUD alignment, twice that many pages may be needed when KASLR memory * randomization is enabled. */ + +#ifndef CONFIG_X86_5LEVEL +#define INIT_PGD_PAGE_TABLES 3 +#else +#define INIT_PGD_PAGE_TABLES 4 +#endif + #ifndef CONFIG_RANDOMIZE_MEMORY -#define INIT_PGD_PAGE_COUNT 6 +#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) #else -#define INIT_PGD_PAGE_COUNT 12 +#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES) #endif + #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) @@ -596,7 +605,7 @@ static unsigned long __init get_new_step_size(unsigned long step_size) static void __init memory_map_top_down(unsigned long map_start, unsigned long map_end) { - unsigned long real_end, start, last_start; + unsigned long real_end, last_start; unsigned long step_size; unsigned long addr; unsigned long mapped_ram_size = 0; @@ -609,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start, step_size = PMD_SIZE; max_pfn_mapped = 0; /* will get exact value next */ min_pfn_mapped = real_end >> PAGE_SHIFT; - last_start = start = real_end; + last_start = real_end; /* * We start from the top (end of memory) and go to the bottom. @@ -618,6 +627,8 @@ static void __init memory_map_top_down(unsigned long map_start, * for page table. */ while (last_start > map_start) { + unsigned long start; + if (last_start > step_size) { start = round_down(last_start - 1, step_size); if (start < map_start) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7c055259de3a..da31c2635ee4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -394,19 +394,6 @@ repeat: return last_map_addr; } -pte_t *kmap_pte; - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* - * Cache the first kmap pte: - */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); -} - #ifdef CONFIG_HIGHMEM static void __init permanent_kmaps_init(pgd_t *pgd_base) { @@ -712,8 +699,6 @@ void __init paging_init(void) __flush_tlb_all(); - kmap_init(); - /* * NOTE: at this point the bootmem allocator is fully available. */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index f60398aeb644..9aaa756ddf21 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - preempt_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} - -/* - * Map 'pfn' using protections 'prot' - */ -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, translate non-WB request to UC- just in @@ -81,36 +60,6 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(prot) &= __default_kernel_pte_mask; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); -} -EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); - -void -iounmap_atomic(void __iomem *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - } - - pagefault_enable(); - preempt_enable(); + return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); } -EXPORT_SYMBOL_GPL(iounmap_atomic); +EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index bc0833713be9..ae78cef79980 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -198,6 +198,37 @@ void __init sme_early_init(void) swiotlb_force = SWIOTLB_FORCE; } +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!sev_active()) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); +} + static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) { pgprot_t old_prot, new_prot; @@ -231,7 +262,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) if (pgprot_val(old_prot) == pgprot_val(new_prot)) return; - pa = pfn << page_level_shift(level); + pa = pfn << PAGE_SHIFT; size = page_level_size(level); /* @@ -351,6 +382,7 @@ bool sev_active(void) { return sev_status & MSR_AMD64_SEV_ENABLED; } +EXPORT_SYMBOL_GPL(sev_active); /* Needs to be called from non-instrumentable code */ bool noinstr sev_es_active(void) @@ -443,9 +475,10 @@ void __init mem_encrypt_init(void) swiotlb_update_mem_attributes(); /* - * With SEV, we need to unroll the rep string I/O instructions. + * With SEV, we need to unroll the rep string I/O instructions, + * but SEV-ES supports them through the #VC handler. */ - if (sev_active()) + if (sev_active() && !sev_es_active()) static_branch_enable(&sev_enable_key); print_mem_encrypt_feature_info(); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index bd7aff5c51f7..cd768dafca9e 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -10,8 +10,6 @@ #define pr_fmt(fmt) "mmiotrace: " fmt -#define DEBUG 1 - #include <linux/moduleparam.h> #include <linux/debugfs.h> #include <linux/slab.h> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 8f665c352bf0..ca311aaa67b8 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1164,12 +1164,14 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + kfree(v); ++*pos; return memtype_get_idx(*pos); } static void memtype_seq_stop(struct seq_file *seq, void *v) { + kfree(v); } static int memtype_seq_show(struct seq_file *seq, void *v) @@ -1181,8 +1183,6 @@ static int memtype_seq_show(struct seq_file *seq, void *v) entry_print->end, cattr_name(entry_print->type)); - kfree(entry_print); - return 0; } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 40baa90e74f4..16f878c26667 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2194,6 +2194,7 @@ int set_direct_map_default_noflush(struct page *page) return __set_pages_p(page, 1); } +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -2225,8 +2226,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) arch_flush_lazy_mmu_mode(); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) { unsigned int level; @@ -2238,7 +2239,6 @@ bool kernel_page_present(struct page *page) pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } -#endif /* CONFIG_HIBERNATION */ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfd82f51ba66..f6a9e2e36642 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -829,6 +829,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) } free_page((unsigned long)pmd_sv); + + pgtable_pmd_page_dtor(virt_to_page(pmd)); free_page((unsigned long)pmd); return 1; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 796506dcfc42..b35fc8023884 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -205,6 +205,18 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +/* Some 1-byte opcodes for binary ALU operations */ +static u8 simple_alu_opcodes[] = { + [BPF_ADD] = 0x01, + [BPF_SUB] = 0x29, + [BPF_AND] = 0x21, + [BPF_OR] = 0x09, + [BPF_XOR] = 0x31, + [BPF_LSH] = 0xE0, + [BPF_RSH] = 0xE8, + [BPF_ARSH] = 0xF8, +}; + static void jit_fill_hole(void *area, unsigned int size) { /* Fill whole space with INT3 instructions */ @@ -681,6 +693,42 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */ +static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_imm8(off)) { + /* 1-byte signed displacement. + * + * If off == 0 we could skip this and save one extra byte, but + * special case of x86 R13 which always needs an offset is not + * worth the hassle + */ + EMIT2(add_2reg(0x40, ptr_reg, val_reg), off); + } else { + /* 4-byte signed displacement */ + EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off); + } + *pprog = prog; +} + +/* + * Emit a REX byte if it will be necessary to address these registers + */ +static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) + EMIT1(add_2mod(0x48, dst_reg, src_reg)); + else if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + *pprog = prog; +} + /* LDX: dst_reg = *(u8*)(src_reg + off) */ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -708,15 +756,7 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); break; } - /* - * If insn->off == 0 we can save one extra byte, but - * special case of x86 R13 which always needs an offset - * is not worth the hassle - */ - if (is_imm8(off)) - EMIT2(add_2reg(0x40, src_reg, dst_reg), off); - else - EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off); + emit_insn_suffix(&prog, src_reg, dst_reg, off); *pprog = prog; } @@ -751,13 +791,53 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); break; } - if (is_imm8(off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off); + emit_insn_suffix(&prog, dst_reg, src_reg, off); *pprog = prog; } +static int emit_atomic(u8 **pprog, u8 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +{ + u8 *prog = *pprog; + int cnt = 0; + + EMIT1(0xF0); /* lock prefix */ + + maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + + emit_insn_suffix(&prog, dst_reg, src_reg, off); + + *pprog = prog; + return 0; +} + static bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) @@ -789,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } +static int emit_nops(u8 **pprog, int len) +{ + u8 *prog = *pprog; + int i, noplen, cnt = 0; + + while (len > 0) { + noplen = len; + + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + + for (i = 0; i < noplen; i++) + EMIT1(ideal_nops[noplen][i]); + len -= noplen; + } + + *pprog = prog; + + return cnt; +} + +#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) + int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; @@ -800,8 +903,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; - int proglen = 0; + int ilen, proglen = 0; u8 *prog = temp; + int err; detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -813,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_prog->aux->func_idx != 0); push_callee_regs(&prog, callee_regs_used); - addrs[0] = prog - temp; + + ilen = prog - temp; + if (image) + memcpy(image + proglen, temp, ilen); + proglen += ilen; + addrs[0] = proglen; + prog = temp; for (i = 1; i <= insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; u8 b2 = 0, b3 = 0; + u8 *start_of_ldx; s64 jmp_offset; u8 jmp_cond; - int ilen; u8 *func; + int nops; switch (insn->code) { /* ALU */ @@ -837,17 +948,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_AND | BPF_X: case BPF_ALU64 | BPF_OR | BPF_X: case BPF_ALU64 | BPF_XOR | BPF_X: - switch (BPF_OP(insn->code)) { - case BPF_ADD: b2 = 0x01; break; - case BPF_SUB: b2 = 0x29; break; - case BPF_AND: b2 = 0x21; break; - case BPF_OR: b2 = 0x09; break; - case BPF_XOR: b2 = 0x31; break; - } - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_ALU64); + b2 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; @@ -1027,12 +1130,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } - + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; if (imm32 == 1) EMIT2(0xD1, add_1reg(b3, dst_reg)); else @@ -1066,11 +1164,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); if (src_reg != BPF_REG_4) @@ -1185,12 +1279,30 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + /* test src_reg, src_reg */ + maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */ + EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg)); + /* jne start_of_ldx */ + EMIT2(X86_JNE, 0); + /* xor dst_reg, dst_reg */ + emit_mov_imm32(&prog, false, dst_reg, 0); + /* jmp byte_after_ldx */ + EMIT2(0xEB, 0); + + /* populate jmp_offset for JNE above */ + temp[4] = prog - temp - 5 /* sizeof(test + jne) */; + start_of_ldx = prog; + } emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; u8 *_insn = image + proglen; s64 delta; + /* populate jmp_offset for JMP above */ + start_of_ldx[-1] = prog - start_of_ldx; + if (!bpf_prog->aux->extable) break; @@ -1230,21 +1342,60 @@ st: if (is_imm8(insn->off)) } break; - /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ - case BPF_STX | BPF_XADD | BPF_W: - /* Emit 'lock add dword ptr [rax + off], eax' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); - else - EMIT2(0xF0, 0x01); - goto xadd; - case BPF_STX | BPF_XADD | BPF_DW: - EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); -xadd: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), - insn->off); + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) { + u8 *branch_target; + bool is64 = BPF_SIZE(insn->code) == BPF_DW; + u32 real_src_reg = src_reg; + + /* + * Can't be implemented with a single x86 insn. + * Need to do a CMPXCHG loop. + */ + + /* Will need RAX as a CMPXCHG operand so save R0 */ + emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + if (src_reg == BPF_REG_0) + real_src_reg = BPF_REG_AX; + + branch_target = prog; + /* Load old value */ + emit_ldx(&prog, BPF_SIZE(insn->code), + BPF_REG_0, dst_reg, insn->off); + /* + * Perform the (commutative) operation locally, + * put the result in the AUX_REG. + */ + emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); + maybe_emit_mod(&prog, AUX_REG, real_src_reg, is64); + EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], + add_2reg(0xC0, AUX_REG, real_src_reg)); + /* Attempt to swap in new value */ + err = emit_atomic(&prog, BPF_CMPXCHG, + dst_reg, AUX_REG, insn->off, + BPF_SIZE(insn->code)); + if (WARN_ON(err)) + return err; + /* + * ZF tells us whether we won the race. If it's + * cleared we need to try again. + */ + EMIT2(X86_JNE, -(prog - branch_target) - 2); + /* Return the pre-modification value */ + emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0); + /* Restore R0 after clobbering RAX */ + emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); + break; + + } + + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + if (err) + return err; break; /* call */ @@ -1295,20 +1446,16 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSGE | BPF_X: case BPF_JMP32 | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP32 | BPF_JSET | BPF_X: /* test dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; @@ -1344,10 +1491,8 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSLE | BPF_K: /* test dst_reg, dst_reg to save one extra byte */ if (imm32 == 0) { - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + maybe_emit_mod(&prog, dst_reg, dst_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); goto emit_cond_jmp; } @@ -1409,6 +1554,30 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ } jmp_offset = addrs[i + insn->off] - addrs[i]; if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To keep the jmp_offset valid, the extra bytes are + * padded before the jump insn, so we substract the + * 2 bytes of jmp_cond insn from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp_cond, then this BPF insn won't shrink, so + * "nops" is 0. + * + * On the other hand, if the previous pass emits an + * imm32 jmp_cond, the extra 4 bytes(*) is padded to + * keep the image from shrinking further. + * + * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond + * is 2 bytes, so the size difference is 4 bytes. + */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 4) { + pr_err("unexpected jmp_cond padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); @@ -1431,11 +1600,55 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ else jmp_offset = addrs[i + insn->off] - addrs[i]; - if (!jmp_offset) - /* Optimize out nop jumps */ + if (!jmp_offset) { + /* + * If jmp_padding is enabled, the extra nops will + * be inserted. Otherwise, optimize out nop jumps. + */ + if (jmp_padding) { + /* There are 3 possible conditions. + * (1) This BPF_JA is already optimized out in + * the previous run, so there is no need + * to pad any extra byte (0 byte). + * (2) The previous pass emits an imm8 jmp, + * so we pad 2 bytes to match the previous + * insn size. + * (3) Similarly, the previous pass emits an + * imm32 jmp, and 5 bytes is padded. + */ + nops = INSN_SZ_DIFF; + if (nops != 0 && nops != 2 && nops != 5) { + pr_err("unexpected nop jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } break; + } emit_jmp: if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To avoid breaking jmp_offset, the extra bytes + * are padded before the actual jmp insn, so + * 2 bytes is substracted from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp, there is nothing to pad (0 byte). + * + * If it emits an imm32 jmp (5 bytes) previously + * and now an imm8 jmp (2 bytes), then we pad + * (5 - 2 = 3) bytes to stop the image from + * shrinking further. + */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 3) { + pr_err("unexpected jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT1_off32(0xE9, jmp_offset); @@ -1531,17 +1744,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; + u8 *jmp_insn; int cnt = 0; - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) - return -EINVAL; - } else { - if (emit_call(&prog, __bpf_prog_enter, prog)) + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_enter_sleepable : + __bpf_prog_enter, prog)) return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); - } + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* if (__bpf_prog_enter*(prog) == 0) + * goto skip_exec_of_prog; + */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ + /* emit 2 nops that will be replaced with JE insn */ + jmp_insn = prog; + emit_nops(&prog, 2); /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1561,43 +1782,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) - return -EINVAL; - } else { - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) + /* replace 2 nops with JE insn, since jmp target is known */ + jmp_insn[0] = X86_JE; + jmp_insn[1] = prog - jmp_insn - 2; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_exit_sleepable : + __bpf_prog_exit, prog)) return -EINVAL; - } *pprog = prog; return 0; } -static void emit_nops(u8 **pprog, unsigned int len) -{ - unsigned int i, noplen; - u8 *prog = *pprog; - int cnt = 0; - - while (len > 0) { - noplen = len; - - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - - for (i = 0; i < noplen; i++) - EMIT1(ideal_nops[noplen][i]); - len -= noplen; - } - - *pprog = prog; -} - static void emit_align(u8 **pprog, u32 align) { u8 *target, *prog = *pprog; @@ -1735,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1774,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1792,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -1802,6 +2011,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { @@ -1832,9 +2044,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ @@ -1972,6 +2192,9 @@ struct x64_jit_data { struct jit_context ctx; }; +#define MAX_PASSES 20 +#define PADDING_PASSES (MAX_PASSES - 5) + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; @@ -1981,6 +2204,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct jit_context ctx = {}; bool tmp_blinded = false; bool extra_pass = false; + bool padding = false; u8 *image = NULL; int *addrs; int pass; @@ -2017,9 +2241,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image = jit_data->image; header = jit_data->header; extra_pass = true; + padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2042,8 +2267,10 @@ skip_init_addrs: * may converge on the last pass. In such case do one more * pass to emit the final image. */ - for (pass = 0; pass < 20 || image; pass++) { - proglen = do_jit(prog, addrs, image, oldproglen, &ctx); + for (pass = 0; pass < MAX_PASSES || image; pass++) { + if (!padding && pass >= PADDING_PASSES) + padding = true; + proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; @@ -2109,7 +2336,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 96fde03aa987..d17b67c69f89 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2243,10 +2243,8 @@ emit_jmp: return -EFAULT; } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; case BPF_JMP | BPF_EXIT: if (seen_exit) { diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile deleted file mode 100644 index 4d49b5a27025..000000000000 --- a/arch/x86/oprofile/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_OPROFILE) += oprofile.o - -DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ - oprof.o cpu_buffer.o buffer_sync.o \ - event_buffer.o oprofile_files.o \ - oprofilefs.o oprofile_stats.o \ - timer_int.o nmi_timer_int.o ) - -oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ - op_model_ppro.o op_model_p4.o diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c deleted file mode 100644 index a2488b6e27d6..000000000000 --- a/arch/x86/oprofile/backtrace.c +++ /dev/null @@ -1,127 +0,0 @@ -/** - * @file backtrace.c - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author David Smith - */ - -#include <linux/oprofile.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/compat.h> -#include <linux/uaccess.h> - -#include <asm/ptrace.h> -#include <asm/stacktrace.h> -#include <asm/unwind.h> - -#ifdef CONFIG_COMPAT -static struct stack_frame_ia32 * -dump_user_backtrace_32(struct stack_frame_ia32 *head) -{ - /* Also check accessibility of one struct frame_head beyond: */ - struct stack_frame_ia32 bufhead[2]; - struct stack_frame_ia32 *fp; - unsigned long bytes; - - bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); - if (bytes != 0) - return NULL; - - fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); - - oprofile_add_trace(bufhead[0].return_address); - - /* frame pointers should strictly progress back up the stack - * (towards higher addresses) */ - if (head >= fp) - return NULL; - - return fp; -} - -static inline int -x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) -{ - struct stack_frame_ia32 *head; - - /* User process is IA32 */ - if (!current || !test_thread_flag(TIF_IA32)) - return 0; - - head = (struct stack_frame_ia32 *) regs->bp; - while (depth-- && head) - head = dump_user_backtrace_32(head); - - return 1; -} - -#else -static inline int -x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) -{ - return 0; -} -#endif /* CONFIG_COMPAT */ - -static struct stack_frame *dump_user_backtrace(struct stack_frame *head) -{ - /* Also check accessibility of one struct frame_head beyond: */ - struct stack_frame bufhead[2]; - unsigned long bytes; - - bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); - if (bytes != 0) - return NULL; - - oprofile_add_trace(bufhead[0].return_address); - - /* frame pointers should strictly progress back up the stack - * (towards higher addresses) */ - if (head >= bufhead[0].next_frame) - return NULL; - - return bufhead[0].next_frame; -} - -void -x86_backtrace(struct pt_regs * const regs, unsigned int depth) -{ - struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); - - if (!user_mode(regs)) { - struct unwind_state state; - unsigned long addr; - - if (!depth) - return; - - oprofile_add_trace(regs->ip); - - if (!--depth) - return; - - for (unwind_start(&state, current, regs, NULL); - !unwind_done(&state); unwind_next_frame(&state)) { - addr = unwind_get_return_address(&state); - if (!addr) - break; - - oprofile_add_trace(addr); - - if (!--depth) - break; - } - - return; - } - - if (x86_backtrace_32(regs, depth)) - return; - - while (depth-- && head) - head = dump_user_backtrace(head); -} diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c deleted file mode 100644 index 9e138d00ad36..000000000000 --- a/arch/x86/oprofile/init.c +++ /dev/null @@ -1,38 +0,0 @@ -/** - * @file init.c - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon <levon@movementarian.org> - */ - -#include <linux/oprofile.h> -#include <linux/init.h> -#include <linux/errno.h> - -/* - * We support CPUs that have performance counters like the Pentium Pro - * with the NMI mode driver. - */ - -#ifdef CONFIG_X86_LOCAL_APIC -extern int op_nmi_init(struct oprofile_operations *ops); -extern void op_nmi_exit(void); -#else -static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; } -static void op_nmi_exit(void) { } -#endif - -extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); - -int __init oprofile_arch_init(struct oprofile_operations *ops) -{ - ops->backtrace = x86_backtrace; - return op_nmi_init(ops); -} - -void oprofile_arch_exit(void) -{ - op_nmi_exit(); -} diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c deleted file mode 100644 index a7a7677265b6..000000000000 --- a/arch/x86/oprofile/nmi_int.c +++ /dev/null @@ -1,780 +0,0 @@ -/** - * @file nmi_int.c - * - * @remark Copyright 2002-2009 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon <levon@movementarian.org> - * @author Robert Richter <robert.richter@amd.com> - * @author Barry Kasindorf <barry.kasindorf@amd.com> - * @author Jason Yeh <jason.yeh@amd.com> - * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#include <linux/init.h> -#include <linux/notifier.h> -#include <linux/smp.h> -#include <linux/oprofile.h> -#include <linux/syscore_ops.h> -#include <linux/slab.h> -#include <linux/moduleparam.h> -#include <linux/kdebug.h> -#include <linux/cpu.h> -#include <asm/nmi.h> -#include <asm/msr.h> -#include <asm/apic.h> - -#include "op_counter.h" -#include "op_x86_model.h" - -static struct op_x86_model_spec *model; -static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); -static DEFINE_PER_CPU(unsigned long, saved_lvtpc); - -/* must be protected with get_online_cpus()/put_online_cpus(): */ -static int nmi_enabled; -static int ctr_running; - -struct op_counter_config counter_config[OP_MAX_COUNTER]; - -/* common functions */ - -u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, - struct op_counter_config *counter_config) -{ - u64 val = 0; - u16 event = (u16)counter_config->event; - - val |= ARCH_PERFMON_EVENTSEL_INT; - val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; - val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; - val |= (counter_config->unit_mask & 0xFF) << 8; - counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_CMASK); - val |= counter_config->extra; - event &= model->event_mask ? model->event_mask : 0xFF; - val |= event & 0xFF; - val |= (u64)(event & 0x0F00) << 24; - - return val; -} - - -static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs) -{ - if (ctr_running) - model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs)); - else if (!nmi_enabled) - return NMI_DONE; - else - model->stop(this_cpu_ptr(&cpu_msrs)); - return NMI_HANDLED; -} - -static void nmi_cpu_save_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *controls = msrs->controls; - unsigned int i; - - for (i = 0; i < model->num_counters; ++i) { - if (counters[i].addr) - rdmsrl(counters[i].addr, counters[i].saved); - } - - for (i = 0; i < model->num_controls; ++i) { - if (controls[i].addr) - rdmsrl(controls[i].addr, controls[i].saved); - } -} - -static void nmi_cpu_start(void *dummy) -{ - struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); - if (!msrs->controls) - WARN_ON_ONCE(1); - else - model->start(msrs); -} - -static int nmi_start(void) -{ - get_online_cpus(); - ctr_running = 1; - /* make ctr_running visible to the nmi handler: */ - smp_mb(); - on_each_cpu(nmi_cpu_start, NULL, 1); - put_online_cpus(); - return 0; -} - -static void nmi_cpu_stop(void *dummy) -{ - struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); - if (!msrs->controls) - WARN_ON_ONCE(1); - else - model->stop(msrs); -} - -static void nmi_stop(void) -{ - get_online_cpus(); - on_each_cpu(nmi_cpu_stop, NULL, 1); - ctr_running = 0; - put_online_cpus(); -} - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static DEFINE_PER_CPU(int, switch_index); - -static inline int has_mux(void) -{ - return !!model->switch_ctrl; -} - -inline int op_x86_phys_to_virt(int phys) -{ - return __this_cpu_read(switch_index) + phys; -} - -inline int op_x86_virt_to_phys(int virt) -{ - return virt % model->num_counters; -} - -static void nmi_shutdown_mux(void) -{ - int i; - - if (!has_mux()) - return; - - for_each_possible_cpu(i) { - kfree(per_cpu(cpu_msrs, i).multiplex); - per_cpu(cpu_msrs, i).multiplex = NULL; - per_cpu(switch_index, i) = 0; - } -} - -static int nmi_setup_mux(void) -{ - size_t multiplex_size = - sizeof(struct op_msr) * model->num_virt_counters; - int i; - - if (!has_mux()) - return 1; - - for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).multiplex = - kzalloc(multiplex_size, GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).multiplex) - return 0; - } - - return 1; -} - -static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) -{ - int i; - struct op_msr *multiplex = msrs->multiplex; - - if (!has_mux()) - return; - - for (i = 0; i < model->num_virt_counters; ++i) { - if (counter_config[i].enabled) { - multiplex[i].saved = -(u64)counter_config[i].count; - } else { - multiplex[i].saved = 0; - } - } - - per_cpu(switch_index, cpu) = 0; -} - -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (counters[i].addr) - rdmsrl(counters[i].addr, multiplex[virt].saved); - } -} - -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (counters[i].addr) - wrmsrl(counters[i].addr, multiplex[virt].saved); - } -} - -static void nmi_cpu_switch(void *dummy) -{ - int cpu = smp_processor_id(); - int si = per_cpu(switch_index, cpu); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_stop(NULL); - nmi_cpu_save_mpx_registers(msrs); - - /* move to next set */ - si += model->num_counters; - if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) - per_cpu(switch_index, cpu) = 0; - else - per_cpu(switch_index, cpu) = si; - - model->switch_ctrl(model, msrs); - nmi_cpu_restore_mpx_registers(msrs); - - nmi_cpu_start(NULL); -} - - -/* - * Quick check to see if multiplexing is necessary. - * The check should be sufficient since counters are used - * in ordre. - */ -static int nmi_multiplex_on(void) -{ - return counter_config[model->num_counters].count ? 0 : -EINVAL; -} - -static int nmi_switch_event(void) -{ - if (!has_mux()) - return -ENOSYS; /* not implemented */ - if (nmi_multiplex_on() < 0) - return -EINVAL; /* not necessary */ - - get_online_cpus(); - if (ctr_running) - on_each_cpu(nmi_cpu_switch, NULL, 1); - put_online_cpus(); - - return 0; -} - -static inline void mux_init(struct oprofile_operations *ops) -{ - if (has_mux()) - ops->switch_events = nmi_switch_event; -} - -static void mux_clone(int cpu) -{ - if (!has_mux()) - return; - - memcpy(per_cpu(cpu_msrs, cpu).multiplex, - per_cpu(cpu_msrs, 0).multiplex, - sizeof(struct op_msr) * model->num_virt_counters); -} - -#else - -inline int op_x86_phys_to_virt(int phys) { return phys; } -inline int op_x86_virt_to_phys(int virt) { return virt; } -static inline void nmi_shutdown_mux(void) { } -static inline int nmi_setup_mux(void) { return 1; } -static inline void -nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } -static inline void mux_init(struct oprofile_operations *ops) { } -static void mux_clone(int cpu) { } - -#endif - -static void free_msrs(void) -{ - int i; - for_each_possible_cpu(i) { - kfree(per_cpu(cpu_msrs, i).counters); - per_cpu(cpu_msrs, i).counters = NULL; - kfree(per_cpu(cpu_msrs, i).controls); - per_cpu(cpu_msrs, i).controls = NULL; - } - nmi_shutdown_mux(); -} - -static int allocate_msrs(void) -{ - size_t controls_size = sizeof(struct op_msr) * model->num_controls; - size_t counters_size = sizeof(struct op_msr) * model->num_counters; - - int i; - for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).counters) - goto fail; - per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).controls) - goto fail; - } - - if (!nmi_setup_mux()) - goto fail; - - return 1; - -fail: - free_msrs(); - return 0; -} - -static void nmi_cpu_setup(void) -{ - int cpu = smp_processor_id(); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_save_registers(msrs); - raw_spin_lock(&oprofilefs_lock); - model->setup_ctrs(model, msrs); - nmi_cpu_setup_mux(cpu, msrs); - raw_spin_unlock(&oprofilefs_lock); - per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, APIC_DM_NMI); -} - -static void nmi_cpu_restore_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *controls = msrs->controls; - unsigned int i; - - for (i = 0; i < model->num_controls; ++i) { - if (controls[i].addr) - wrmsrl(controls[i].addr, controls[i].saved); - } - - for (i = 0; i < model->num_counters; ++i) { - if (counters[i].addr) - wrmsrl(counters[i].addr, counters[i].saved); - } -} - -static void nmi_cpu_shutdown(void) -{ - unsigned int v; - int cpu = smp_processor_id(); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - /* restoring APIC_LVTPC can trigger an apic error because the delivery - * mode and vector nr combination can be illegal. That's by design: on - * power on apic lvt contain a zero vector nr which are legal only for - * NMI delivery mode. So inhibit apic err before restoring lvtpc - */ - v = apic_read(APIC_LVTERR); - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); - apic_write(APIC_LVTERR, v); - nmi_cpu_restore_registers(msrs); -} - -static int nmi_cpu_online(unsigned int cpu) -{ - local_irq_disable(); - if (nmi_enabled) - nmi_cpu_setup(); - if (ctr_running) - nmi_cpu_start(NULL); - local_irq_enable(); - return 0; -} - -static int nmi_cpu_down_prep(unsigned int cpu) -{ - local_irq_disable(); - if (ctr_running) - nmi_cpu_stop(NULL); - if (nmi_enabled) - nmi_cpu_shutdown(); - local_irq_enable(); - return 0; -} - -static int nmi_create_files(struct dentry *root) -{ - unsigned int i; - - for (i = 0; i < model->num_virt_counters; ++i) { - struct dentry *dir; - char buf[4]; - - /* quick little hack to _not_ expose a counter if it is not - * available for use. This should protect userspace app. - * NOTE: assumes 1:1 mapping here (that counters are organized - * sequentially in their struct assignment). - */ - if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) - continue; - - snprintf(buf, sizeof(buf), "%d", i); - dir = oprofilefs_mkdir(root, buf); - oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(dir, "user", &counter_config[i].user); - oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); - } - - return 0; -} - -static enum cpuhp_state cpuhp_nmi_online; - -static int nmi_setup(void) -{ - int err = 0; - int cpu; - - if (!allocate_msrs()) - return -ENOMEM; - - /* We need to serialize save and setup for HT because the subset - * of msrs are distinct for save and setup operations - */ - - /* Assume saved/restored counters are the same on all CPUs */ - err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); - if (err) - goto fail; - - for_each_possible_cpu(cpu) { - if (!IS_ENABLED(CONFIG_SMP) || !cpu) - continue; - - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); - - mux_clone(cpu); - } - - nmi_enabled = 0; - ctr_running = 0; - /* make variables visible to the nmi handler: */ - smp_mb(); - err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify, - 0, "oprofile"); - if (err) - goto fail; - - nmi_enabled = 1; - /* make nmi_enabled visible to the nmi handler: */ - smp_mb(); - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online", - nmi_cpu_online, nmi_cpu_down_prep); - if (err < 0) - goto fail_nmi; - cpuhp_nmi_online = err; - return 0; -fail_nmi: - unregister_nmi_handler(NMI_LOCAL, "oprofile"); -fail: - free_msrs(); - return err; -} - -static void nmi_shutdown(void) -{ - struct op_msrs *msrs; - - cpuhp_remove_state(cpuhp_nmi_online); - nmi_enabled = 0; - ctr_running = 0; - - /* make variables visible to the nmi handler: */ - smp_mb(); - unregister_nmi_handler(NMI_LOCAL, "oprofile"); - msrs = &get_cpu_var(cpu_msrs); - model->shutdown(msrs); - free_msrs(); - put_cpu_var(cpu_msrs); -} - -#ifdef CONFIG_PM - -static int nmi_suspend(void) -{ - /* Only one CPU left, just stop that one */ - if (nmi_enabled == 1) - nmi_cpu_stop(NULL); - return 0; -} - -static void nmi_resume(void) -{ - if (nmi_enabled == 1) - nmi_cpu_start(NULL); -} - -static struct syscore_ops oprofile_syscore_ops = { - .resume = nmi_resume, - .suspend = nmi_suspend, -}; - -static void __init init_suspend_resume(void) -{ - register_syscore_ops(&oprofile_syscore_ops); -} - -static void exit_suspend_resume(void) -{ - unregister_syscore_ops(&oprofile_syscore_ops); -} - -#else - -static inline void init_suspend_resume(void) { } -static inline void exit_suspend_resume(void) { } - -#endif /* CONFIG_PM */ - -static int __init p4_init(char **cpu_type) -{ - __u8 cpu_model = boot_cpu_data.x86_model; - - if (cpu_model > 6 || cpu_model == 5) - return 0; - -#ifndef CONFIG_SMP - *cpu_type = "i386/p4"; - model = &op_p4_spec; - return 1; -#else - switch (smp_num_siblings) { - case 1: - *cpu_type = "i386/p4"; - model = &op_p4_spec; - return 1; - - case 2: - *cpu_type = "i386/p4-ht"; - model = &op_p4_ht2_spec; - return 1; - } -#endif - - printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n"); - printk(KERN_INFO "oprofile: Reverting to timer mode.\n"); - return 0; -} - -enum __force_cpu_type { - reserved = 0, /* do not force */ - timer, - arch_perfmon, -}; - -static int force_cpu_type; - -static int set_cpu_type(const char *str, const struct kernel_param *kp) -{ - if (!strcmp(str, "timer")) { - force_cpu_type = timer; - printk(KERN_INFO "oprofile: forcing NMI timer mode\n"); - } else if (!strcmp(str, "arch_perfmon")) { - force_cpu_type = arch_perfmon; - printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); - } else { - force_cpu_type = 0; - } - - return 0; -} -module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); - -static int __init ppro_init(char **cpu_type) -{ - __u8 cpu_model = boot_cpu_data.x86_model; - struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - - if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) - return 0; - - /* - * Documentation on identifying Intel processors by CPU family - * and model can be found in the Intel Software Developer's - * Manuals (SDM): - * - * http://www.intel.com/products/processor/manuals/ - * - * As of May 2010 the documentation for this was in the: - * "Intel 64 and IA-32 Architectures Software Developer's - * Manual Volume 3B: System Programming Guide", "Table B-1 - * CPUID Signature Values of DisplayFamily_DisplayModel". - */ - switch (cpu_model) { - case 0 ... 2: - *cpu_type = "i386/ppro"; - break; - case 3 ... 5: - *cpu_type = "i386/pii"; - break; - case 6 ... 8: - case 10 ... 11: - *cpu_type = "i386/piii"; - break; - case 9: - case 13: - *cpu_type = "i386/p6_mobile"; - break; - case 14: - *cpu_type = "i386/core"; - break; - case 0x0f: - case 0x16: - case 0x17: - case 0x1d: - *cpu_type = "i386/core_2"; - break; - case 0x1a: - case 0x1e: - case 0x2e: - spec = &op_arch_perfmon_spec; - *cpu_type = "i386/core_i7"; - break; - case 0x1c: - *cpu_type = "i386/atom"; - break; - default: - /* Unknown */ - return 0; - } - - model = spec; - return 1; -} - -int __init op_nmi_init(struct oprofile_operations *ops) -{ - __u8 vendor = boot_cpu_data.x86_vendor; - __u8 family = boot_cpu_data.x86; - char *cpu_type = NULL; - int ret = 0; - - if (!boot_cpu_has(X86_FEATURE_APIC)) - return -ENODEV; - - if (force_cpu_type == timer) - return -ENODEV; - - switch (vendor) { - case X86_VENDOR_AMD: - /* Needs to be at least an Athlon (or hammer in 32bit mode) */ - - switch (family) { - case 6: - cpu_type = "i386/athlon"; - break; - case 0xf: - /* - * Actually it could be i386/hammer too, but - * give user space an consistent name. - */ - cpu_type = "x86-64/hammer"; - break; - case 0x10: - cpu_type = "x86-64/family10"; - break; - case 0x11: - cpu_type = "x86-64/family11h"; - break; - case 0x12: - cpu_type = "x86-64/family12h"; - break; - case 0x14: - cpu_type = "x86-64/family14h"; - break; - case 0x15: - cpu_type = "x86-64/family15h"; - break; - default: - return -ENODEV; - } - model = &op_amd_spec; - break; - - case X86_VENDOR_INTEL: - switch (family) { - /* Pentium IV */ - case 0xf: - p4_init(&cpu_type); - break; - - /* A P6-class processor */ - case 6: - ppro_init(&cpu_type); - break; - - default: - break; - } - - if (cpu_type) - break; - - if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) - return -ENODEV; - - /* use arch perfmon as fallback */ - cpu_type = "i386/arch_perfmon"; - model = &op_arch_perfmon_spec; - break; - - default: - return -ENODEV; - } - - /* default values, can be overwritten by model */ - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; - - if (model->init) - ret = model->init(ops); - if (ret) - return ret; - - if (!model->num_virt_counters) - model->num_virt_counters = model->num_counters; - - mux_init(ops); - - init_suspend_resume(); - - printk(KERN_INFO "oprofile: using NMI interrupt.\n"); - return 0; -} - -void op_nmi_exit(void) -{ - exit_suspend_resume(); -} diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h deleted file mode 100644 index 0b7b7b179cbe..000000000000 --- a/arch/x86/oprofile/op_counter.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * @file op_counter.h - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - */ - -#ifndef OP_COUNTER_H -#define OP_COUNTER_H - -#define OP_MAX_COUNTER 32 - -/* Per-perfctr configuration as set via - * oprofilefs. - */ -struct op_counter_config { - unsigned long count; - unsigned long enabled; - unsigned long event; - unsigned long kernel; - unsigned long user; - unsigned long unit_mask; - unsigned long extra; -}; - -extern struct op_counter_config counter_config[]; - -#endif /* OP_COUNTER_H */ diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c deleted file mode 100644 index 660a83c8287b..000000000000 --- a/arch/x86/oprofile/op_model_amd.c +++ /dev/null @@ -1,542 +0,0 @@ -/* - * @file op_model_amd.c - * athlon / K7 / K8 / Family 10h model-specific MSR operations - * - * @remark Copyright 2002-2009 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - * @author Robert Richter <robert.richter@amd.com> - * @author Barry Kasindorf <barry.kasindorf@amd.com> - * @author Jason Yeh <jason.yeh@amd.com> - * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#include <linux/oprofile.h> -#include <linux/device.h> -#include <linux/pci.h> -#include <linux/percpu.h> - -#include <asm/ptrace.h> -#include <asm/msr.h> -#include <asm/nmi.h> -#include <asm/apic.h> -#include <asm/processor.h> - -#include "op_x86_model.h" -#include "op_counter.h" - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -#define NUM_VIRT_COUNTERS 32 -#else -#define NUM_VIRT_COUNTERS 0 -#endif - -#define OP_EVENT_MASK 0x0FFF -#define OP_CTR_OVERFLOW (1ULL<<31) - -#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) - -static int num_counters; -static unsigned long reset_value[OP_MAX_COUNTER]; - -#define IBS_FETCH_SIZE 6 -#define IBS_OP_SIZE 12 - -static u32 ibs_caps; - -struct ibs_config { - unsigned long op_enabled; - unsigned long fetch_enabled; - unsigned long max_cnt_fetch; - unsigned long max_cnt_op; - unsigned long rand_en; - unsigned long dispatched_ops; - unsigned long branch_target; -}; - -struct ibs_state { - u64 ibs_op_ctl; - int branch_target; - unsigned long sample_size; -}; - -static struct ibs_config ibs_config; -static struct ibs_state ibs_state; - -/* - * IBS randomization macros - */ -#define IBS_RANDOM_BITS 12 -#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) -#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) - -/* - * 16-bit Linear Feedback Shift Register (LFSR) - * - * 16 14 13 11 - * Feedback polynomial = X + X + X + X + 1 - */ -static unsigned int lfsr_random(void) -{ - static unsigned int lfsr_value = 0xF00D; - unsigned int bit; - - /* Compute next bit to shift in */ - bit = ((lfsr_value >> 0) ^ - (lfsr_value >> 2) ^ - (lfsr_value >> 3) ^ - (lfsr_value >> 5)) & 0x0001; - - /* Advance to next register value */ - lfsr_value = (lfsr_value >> 1) | (bit << 15); - - return lfsr_value; -} - -/* - * IBS software randomization - * - * The IBS periodic op counter is randomized in software. The lower 12 - * bits of the 20 bit counter are randomized. IbsOpCurCnt is - * initialized with a 12 bit random value. - */ -static inline u64 op_amd_randomize_ibs_op(u64 val) -{ - unsigned int random = lfsr_random(); - - if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) - /* - * Work around if the hw can not write to IbsOpCurCnt - * - * Randomize the lower 8 bits of the 16 bit - * IbsOpMaxCnt [15:0] value in the range of -128 to - * +127 by adding/subtracting an offset to the - * maximum count (IbsOpMaxCnt). - * - * To avoid over or underflows and protect upper bits - * starting at bit 16, the initial value for - * IbsOpMaxCnt must fit in the range from 0x0081 to - * 0xff80. - */ - val += (s8)(random >> 4); - else - val |= (u64)(random & IBS_RANDOM_MASK) << 32; - - return val; -} - -static inline void -op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - u64 val, ctl; - struct op_entry entry; - - if (!ibs_caps) - return; - - if (ibs_config.fetch_enabled) { - rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); - if (ctl & IBS_FETCH_VAL) { - rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); - oprofile_write_reserve(&entry, regs, val, - IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data64(&entry, val); - oprofile_add_data64(&entry, ctl); - rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); - oprofile_add_data64(&entry, val); - oprofile_write_commit(&entry); - - /* reenable the IRQ */ - ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); - ctl |= IBS_FETCH_ENABLE; - wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); - } - } - - if (ibs_config.op_enabled) { - rdmsrl(MSR_AMD64_IBSOPCTL, ctl); - if (ctl & IBS_OP_VAL) { - rdmsrl(MSR_AMD64_IBSOPRIP, val); - oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, - ibs_state.sample_size); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSOPDATA, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSOPDATA2, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSOPDATA3, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSDCLINAD, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); - oprofile_add_data64(&entry, val); - if (ibs_state.branch_target) { - rdmsrl(MSR_AMD64_IBSBRTARGET, val); - oprofile_add_data(&entry, (unsigned long)val); - } - oprofile_write_commit(&entry); - - /* reenable the IRQ */ - ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); - wrmsrl(MSR_AMD64_IBSOPCTL, ctl); - } - } -} - -static inline void op_amd_start_ibs(void) -{ - u64 val; - - if (!ibs_caps) - return; - - memset(&ibs_state, 0, sizeof(ibs_state)); - - /* - * Note: Since the max count settings may out of range we - * write back the actual used values so that userland can read - * it. - */ - - if (ibs_config.fetch_enabled) { - val = ibs_config.max_cnt_fetch >> 4; - val = min(val, IBS_FETCH_MAX_CNT); - ibs_config.max_cnt_fetch = val << 4; - val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; - val |= IBS_FETCH_ENABLE; - wrmsrl(MSR_AMD64_IBSFETCHCTL, val); - } - - if (ibs_config.op_enabled) { - val = ibs_config.max_cnt_op >> 4; - if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { - /* - * IbsOpCurCnt not supported. See - * op_amd_randomize_ibs_op() for details. - */ - val = clamp(val, 0x0081ULL, 0xFF80ULL); - ibs_config.max_cnt_op = val << 4; - } else { - /* - * The start value is randomized with a - * positive offset, we need to compensate it - * with the half of the randomized range. Also - * avoid underflows. - */ - val += IBS_RANDOM_MAXCNT_OFFSET; - if (ibs_caps & IBS_CAPS_OPCNTEXT) - val = min(val, IBS_OP_MAX_CNT_EXT); - else - val = min(val, IBS_OP_MAX_CNT); - ibs_config.max_cnt_op = - (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; - } - val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); - val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; - val |= IBS_OP_ENABLE; - ibs_state.ibs_op_ctl = val; - ibs_state.sample_size = IBS_OP_SIZE; - if (ibs_config.branch_target) { - ibs_state.branch_target = 1; - ibs_state.sample_size++; - } - val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); - wrmsrl(MSR_AMD64_IBSOPCTL, val); - } -} - -static void op_amd_stop_ibs(void) -{ - if (!ibs_caps) - return; - - if (ibs_config.fetch_enabled) - /* clear max count and enable */ - wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - - if (ibs_config.op_enabled) - /* clear max count and enable */ - wrmsrl(MSR_AMD64_IBSOPCTL, 0); -} - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* enable active counters */ - for (i = 0; i < num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!reset_value[virt]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -#endif - -/* functions for op_amd_spec */ - -static void op_amd_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (!msrs->counters[i].addr) - continue; - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -static int op_amd_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < num_counters; i++) { - if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - goto fail; - if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - goto fail; - } - /* both registers must be reserved */ - if (num_counters == AMD64_NUM_COUNTERS_CORE) { - msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); - msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); - } else { - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - } - continue; - fail: - if (!counter_config[i].enabled) - continue; - op_x86_warn_reserved(i); - op_amd_shutdown(msrs); - return -EBUSY; - } - - return 0; -} - -static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* setup reset_value */ - for (i = 0; i < OP_MAX_COUNTER; ++i) { - if (counter_config[i].enabled - && msrs->counters[op_x86_virt_to_phys(i)].addr) - reset_value[i] = counter_config[i].count; - else - reset_value[i] = 0; - } - - /* clear all counters */ - for (i = 0; i < num_counters; ++i) { - if (!msrs->controls[i].addr) - continue; - rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) - op_x86_warn_in_use(i); - val &= model->reserved; - wrmsrl(msrs->controls[i].addr, val); - /* - * avoid a false detection of ctr overflows in NMI - * handler - */ - wrmsrl(msrs->counters[i].addr, -1LL); - } - - /* enable active counters */ - for (i = 0; i < num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!reset_value[virt]) - continue; - - /* setup counter registers */ - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); - - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -static int op_amd_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!reset_value[virt]) - continue; - rdmsrl(msrs->counters[i].addr, val); - /* bit is clear if overflowed: */ - if (val & OP_CTR_OVERFLOW) - continue; - oprofile_add_sample(regs, virt); - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); - } - - op_amd_handle_ibs(regs, msrs); - - /* See op_model_ppro.c */ - return 1; -} - -static void op_amd_start(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[op_x86_phys_to_virt(i)]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } - - op_amd_start_ibs(); -} - -static void op_amd_stop(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* - * Subtle: stop on all counters to avoid race with setting our - * pm callback - */ - for (i = 0; i < num_counters; ++i) { - if (!reset_value[op_x86_phys_to_virt(i)]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } - - op_amd_stop_ibs(); -} - -/* - * check and reserve APIC extended interrupt LVT offset for IBS if - * available - */ - -static void init_ibs(void) -{ - ibs_caps = get_ibs_caps(); - - if (!ibs_caps) - return; - - printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); -} - -static int (*create_arch_files)(struct dentry *root); - -static int setup_ibs_files(struct dentry *root) -{ - struct dentry *dir; - int ret = 0; - - /* architecture specific files */ - if (create_arch_files) - ret = create_arch_files(root); - - if (ret) - return ret; - - if (!ibs_caps) - return ret; - - /* model specific files */ - - /* setup some reasonable defaults */ - memset(&ibs_config, 0, sizeof(ibs_config)); - ibs_config.max_cnt_fetch = 250000; - ibs_config.max_cnt_op = 250000; - - if (ibs_caps & IBS_CAPS_FETCHSAM) { - dir = oprofilefs_mkdir(root, "ibs_fetch"); - oprofilefs_create_ulong(dir, "enable", - &ibs_config.fetch_enabled); - oprofilefs_create_ulong(dir, "max_count", - &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(dir, "rand_enable", - &ibs_config.rand_en); - } - - if (ibs_caps & IBS_CAPS_OPSAM) { - dir = oprofilefs_mkdir(root, "ibs_op"); - oprofilefs_create_ulong(dir, "enable", - &ibs_config.op_enabled); - oprofilefs_create_ulong(dir, "max_count", - &ibs_config.max_cnt_op); - if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(dir, "dispatched_ops", - &ibs_config.dispatched_ops); - if (ibs_caps & IBS_CAPS_BRNTRGT) - oprofilefs_create_ulong(dir, "branch_target", - &ibs_config.branch_target); - } - - return 0; -} - -struct op_x86_model_spec op_amd_spec; - -static int op_amd_init(struct oprofile_operations *ops) -{ - init_ibs(); - create_arch_files = ops->create_files; - ops->create_files = setup_ibs_files; - - if (boot_cpu_data.x86 == 0x15) { - num_counters = AMD64_NUM_COUNTERS_CORE; - } else { - num_counters = AMD64_NUM_COUNTERS; - } - - op_amd_spec.num_counters = num_counters; - op_amd_spec.num_controls = num_counters; - op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); - - return 0; -} - -struct op_x86_model_spec op_amd_spec = { - /* num_counters/num_controls filled in at runtime */ - .reserved = MSR_AMD_EVENTSEL_RESERVED, - .event_mask = OP_EVENT_MASK, - .init = op_amd_init, - .fill_in_addresses = &op_amd_fill_in_addresses, - .setup_ctrs = &op_amd_setup_ctrs, - .check_ctrs = &op_amd_check_ctrs, - .start = &op_amd_start, - .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown, -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - .switch_ctrl = &op_mux_switch_ctrl, -#endif -}; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c deleted file mode 100644 index ad1d91f475ab..000000000000 --- a/arch/x86/oprofile/op_model_p4.c +++ /dev/null @@ -1,723 +0,0 @@ -/** - * @file op_model_p4.c - * P4 model-specific MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author Graydon Hoare - */ - -#include <linux/oprofile.h> -#include <linux/smp.h> -#include <linux/ptrace.h> -#include <asm/nmi.h> -#include <asm/msr.h> -#include <asm/fixmap.h> -#include <asm/apic.h> - - -#include "op_x86_model.h" -#include "op_counter.h" - -#define NUM_EVENTS 39 - -#define NUM_COUNTERS_NON_HT 8 -#define NUM_ESCRS_NON_HT 45 -#define NUM_CCCRS_NON_HT 18 -#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) - -#define NUM_COUNTERS_HT2 4 -#define NUM_ESCRS_HT2 23 -#define NUM_CCCRS_HT2 9 -#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) - -#define OP_CTR_OVERFLOW (1ULL<<31) - -static unsigned int num_counters = NUM_COUNTERS_NON_HT; -static unsigned int num_controls = NUM_CONTROLS_NON_HT; - -/* this has to be checked dynamically since the - hyper-threadedness of a chip is discovered at - kernel boot-time. */ -static inline void setup_num_counters(void) -{ -#ifdef CONFIG_SMP - if (smp_num_siblings == 2) { - num_counters = NUM_COUNTERS_HT2; - num_controls = NUM_CONTROLS_HT2; - } -#endif -} - -static inline int addr_increment(void) -{ -#ifdef CONFIG_SMP - return smp_num_siblings == 2 ? 2 : 1; -#else - return 1; -#endif -} - - -/* tables to simulate simplified hardware view of p4 registers */ -struct p4_counter_binding { - int virt_counter; - int counter_address; - int cccr_address; -}; - -struct p4_event_binding { - int escr_select; /* value to put in CCCR */ - int event_select; /* value to put in ESCR */ - struct { - int virt_counter; /* for this counter... */ - int escr_address; /* use this ESCR */ - } bindings[2]; -}; - -/* nb: these CTR_* defines are a duplicate of defines in - event/i386.p4*events. */ - - -#define CTR_BPU_0 (1 << 0) -#define CTR_MS_0 (1 << 1) -#define CTR_FLAME_0 (1 << 2) -#define CTR_IQ_4 (1 << 3) -#define CTR_BPU_2 (1 << 4) -#define CTR_MS_2 (1 << 5) -#define CTR_FLAME_2 (1 << 6) -#define CTR_IQ_5 (1 << 7) - -static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = { - { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, - { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, - { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, - { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, - { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, - { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, - { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, - { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } -}; - -#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT) - -/* p4 event codes in libop/op_event.h are indices into this table. */ - -static struct p4_event_binding p4_events[NUM_EVENTS] = { - - { /* BRANCH_RETIRED */ - 0x05, 0x06, - { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, - {CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* MISPRED_BRANCH_RETIRED */ - 0x04, 0x03, - { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, - { CTR_IQ_5, MSR_P4_CRU_ESCR1} } - }, - - { /* TC_DELIVER_MODE */ - 0x01, 0x01, - { { CTR_MS_0, MSR_P4_TC_ESCR0}, - { CTR_MS_2, MSR_P4_TC_ESCR1} } - }, - - { /* BPU_FETCH_REQUEST */ - 0x00, 0x03, - { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, - { CTR_BPU_2, MSR_P4_BPU_ESCR1} } - }, - - { /* ITLB_REFERENCE */ - 0x03, 0x18, - { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, - { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } - }, - - { /* MEMORY_CANCEL */ - 0x05, 0x02, - { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, - { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } - }, - - { /* MEMORY_COMPLETE */ - 0x02, 0x08, - { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, - { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } - }, - - { /* LOAD_PORT_REPLAY */ - 0x02, 0x04, - { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, - { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } - }, - - { /* STORE_PORT_REPLAY */ - 0x02, 0x05, - { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, - { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } - }, - - { /* MOB_LOAD_REPLAY */ - 0x02, 0x03, - { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, - { CTR_BPU_2, MSR_P4_MOB_ESCR1} } - }, - - { /* PAGE_WALK_TYPE */ - 0x04, 0x01, - { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, - { CTR_BPU_2, MSR_P4_PMH_ESCR1} } - }, - - { /* BSQ_CACHE_REFERENCE */ - 0x07, 0x0c, - { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, - { CTR_BPU_2, MSR_P4_BSU_ESCR1} } - }, - - { /* IOQ_ALLOCATION */ - 0x06, 0x03, - { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, - { 0, 0 } } - }, - - { /* IOQ_ACTIVE_ENTRIES */ - 0x06, 0x1a, - { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, - { 0, 0 } } - }, - - { /* FSB_DATA_ACTIVITY */ - 0x06, 0x17, - { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, - { CTR_BPU_2, MSR_P4_FSB_ESCR1} } - }, - - { /* BSQ_ALLOCATION */ - 0x07, 0x05, - { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, - { 0, 0 } } - }, - - { /* BSQ_ACTIVE_ENTRIES */ - 0x07, 0x06, - { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, - { 0, 0 } } - }, - - { /* X87_ASSIST */ - 0x05, 0x03, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* SSE_INPUT_ASSIST */ - 0x01, 0x34, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* PACKED_SP_UOP */ - 0x01, 0x08, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* PACKED_DP_UOP */ - 0x01, 0x0c, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* SCALAR_SP_UOP */ - 0x01, 0x0a, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* SCALAR_DP_UOP */ - 0x01, 0x0e, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* 64BIT_MMX_UOP */ - 0x01, 0x02, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* 128BIT_MMX_UOP */ - 0x01, 0x1a, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* X87_FP_UOP */ - 0x01, 0x04, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* X87_SIMD_MOVES_UOP */ - 0x01, 0x2e, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* MACHINE_CLEAR */ - 0x05, 0x02, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* GLOBAL_POWER_EVENTS */ - 0x06, 0x13 /* older manual says 0x05, newer 0x13 */, - { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, - { CTR_BPU_2, MSR_P4_FSB_ESCR1} } - }, - - { /* TC_MS_XFER */ - 0x00, 0x05, - { { CTR_MS_0, MSR_P4_MS_ESCR0}, - { CTR_MS_2, MSR_P4_MS_ESCR1} } - }, - - { /* UOP_QUEUE_WRITES */ - 0x00, 0x09, - { { CTR_MS_0, MSR_P4_MS_ESCR0}, - { CTR_MS_2, MSR_P4_MS_ESCR1} } - }, - - { /* FRONT_END_EVENT */ - 0x05, 0x08, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* EXECUTION_EVENT */ - 0x05, 0x0c, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* REPLAY_EVENT */ - 0x05, 0x09, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* INSTR_RETIRED */ - 0x04, 0x02, - { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, - { CTR_IQ_5, MSR_P4_CRU_ESCR1} } - }, - - { /* UOPS_RETIRED */ - 0x04, 0x01, - { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, - { CTR_IQ_5, MSR_P4_CRU_ESCR1} } - }, - - { /* UOP_TYPE */ - 0x02, 0x02, - { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, - { CTR_IQ_5, MSR_P4_RAT_ESCR1} } - }, - - { /* RETIRED_MISPRED_BRANCH_TYPE */ - 0x02, 0x05, - { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, - { CTR_MS_2, MSR_P4_TBPU_ESCR1} } - }, - - { /* RETIRED_BRANCH_TYPE */ - 0x02, 0x04, - { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, - { CTR_MS_2, MSR_P4_TBPU_ESCR1} } - } -}; - - -#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) - -#define ESCR_RESERVED_BITS 0x80000003 -#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) -#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) -#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) -#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) -#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) -#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) -#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) - -#define CCCR_RESERVED_BITS 0x38030FFF -#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) -#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) -#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) -#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) -#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) -#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) -#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) -#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) - - -/* this assigns a "stagger" to the current CPU, which is used throughout - the code in this module as an extra array offset, to select the "even" - or "odd" part of all the divided resources. */ -static unsigned int get_stagger(void) -{ -#ifdef CONFIG_SMP - int cpu = smp_processor_id(); - return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); -#endif - return 0; -} - - -/* finally, mediate access to a real hardware counter - by passing a "virtual" counter numer to this macro, - along with your stagger setting. */ -#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) - -static unsigned long reset_value[NUM_COUNTERS_NON_HT]; - -static void p4_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(msrs->counters[i].addr); - } - /* - * some of the control registers are specially reserved in - * conjunction with the counter registers (hence the starting offset). - * This saves a few bits. - */ - for (i = num_counters; i < num_controls; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(msrs->controls[i].addr); - } -} - -static int p4_fill_in_addresses(struct op_msrs * const msrs) -{ - unsigned int i; - unsigned int addr, cccraddr, stag; - - setup_num_counters(); - stag = get_stagger(); - - /* the counter & cccr registers we pay attention to */ - for (i = 0; i < num_counters; ++i) { - addr = p4_counters[VIRT_CTR(stag, i)].counter_address; - cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; - if (reserve_perfctr_nmi(addr)) { - msrs->counters[i].addr = addr; - msrs->controls[i].addr = cccraddr; - } - } - - /* 43 ESCR registers in three or four discontiguous group */ - for (addr = MSR_P4_BSU_ESCR0 + stag; - addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1 - * to avoid special case in nmi_{save|restore}_registers() */ - if (boot_cpu_data.x86_model >= 0x3) { - for (addr = MSR_P4_BSU_ESCR0 + stag; - addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - } else { - for (addr = MSR_P4_IQ_ESCR0 + stag; - addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - } - - for (addr = MSR_P4_RAT_ESCR0 + stag; - addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - for (addr = MSR_P4_MS_ESCR0 + stag; - addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - for (addr = MSR_P4_IX_ESCR0 + stag; - addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - /* there are 2 remaining non-contiguously located ESCRs */ - - if (num_counters == NUM_COUNTERS_NON_HT) { - /* standard non-HT CPUs handle both remaining ESCRs*/ - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) - msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; - - } else if (stag == 0) { - /* HT CPUs give the first remainder to the even thread, as - the 32nd control register */ - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) - msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; - - } else { - /* and two copies of the second to the odd thread, - for the 22st and 23nd control registers */ - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) { - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - } - } - - for (i = 0; i < num_counters; ++i) { - if (!counter_config[i].enabled) - continue; - if (msrs->controls[i].addr) - continue; - op_x86_warn_reserved(i); - p4_shutdown(msrs); - return -EBUSY; - } - - return 0; -} - - -static void pmc_setup_one_p4_counter(unsigned int ctr) -{ - int i; - int const maxbind = 2; - unsigned int cccr = 0; - unsigned int escr = 0; - unsigned int high = 0; - unsigned int counter_bit; - struct p4_event_binding *ev = NULL; - unsigned int stag; - - stag = get_stagger(); - - /* convert from counter *number* to counter *bit* */ - counter_bit = 1 << VIRT_CTR(stag, ctr); - - /* find our event binding structure. */ - if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { - printk(KERN_ERR - "oprofile: P4 event code 0x%lx out of range\n", - counter_config[ctr].event); - return; - } - - ev = &(p4_events[counter_config[ctr].event - 1]); - - for (i = 0; i < maxbind; i++) { - if (ev->bindings[i].virt_counter & counter_bit) { - - /* modify ESCR */ - rdmsr(ev->bindings[i].escr_address, escr, high); - ESCR_CLEAR(escr); - if (stag == 0) { - ESCR_SET_USR_0(escr, counter_config[ctr].user); - ESCR_SET_OS_0(escr, counter_config[ctr].kernel); - } else { - ESCR_SET_USR_1(escr, counter_config[ctr].user); - ESCR_SET_OS_1(escr, counter_config[ctr].kernel); - } - ESCR_SET_EVENT_SELECT(escr, ev->event_select); - ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); - wrmsr(ev->bindings[i].escr_address, escr, high); - - /* modify CCCR */ - rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, - cccr, high); - CCCR_CLEAR(cccr); - CCCR_SET_REQUIRED_BITS(cccr); - CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); - if (stag == 0) - CCCR_SET_PMI_OVF_0(cccr); - else - CCCR_SET_PMI_OVF_1(cccr); - wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, - cccr, high); - return; - } - } - - printk(KERN_ERR - "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", - counter_config[ctr].event, stag, ctr); -} - - -static void p4_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - unsigned int i; - unsigned int low, high; - unsigned int stag; - - stag = get_stagger(); - - rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (!MISC_PMC_ENABLED_P(low)) { - printk(KERN_ERR "oprofile: P4 PMC not available\n"); - return; - } - - /* clear the cccrs we will use */ - for (i = 0; i < num_counters; i++) { - if (unlikely(!msrs->controls[i].addr)) - continue; - rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - CCCR_CLEAR(low); - CCCR_SET_REQUIRED_BITS(low); - wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - } - - /* clear all escrs (including those outside our concern) */ - for (i = num_counters; i < num_controls; i++) { - if (unlikely(!msrs->controls[i].addr)) - continue; - wrmsr(msrs->controls[i].addr, 0, 0); - } - - /* setup all counters */ - for (i = 0; i < num_counters; ++i) { - if (counter_config[i].enabled && msrs->controls[i].addr) { - reset_value[i] = counter_config[i].count; - pmc_setup_one_p4_counter(i); - wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, - -(u64)counter_config[i].count); - } else { - reset_value[i] = 0; - } - } -} - - -static int p4_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - unsigned long ctr, low, high, stag, real; - int i; - - stag = get_stagger(); - - for (i = 0; i < num_counters; ++i) { - - if (!reset_value[i]) - continue; - - /* - * there is some eccentricity in the hardware which - * requires that we perform 2 extra corrections: - * - * - check both the CCCR:OVF flag for overflow and the - * counter high bit for un-flagged overflows. - * - * - write the counter back twice to ensure it gets - * updated properly. - * - * the former seems to be related to extra NMIs happening - * during the current NMI; the latter is reported as errata - * N15 in intel doc 249199-029, pentium 4 specification - * update, though their suggested work-around does not - * appear to solve the problem. - */ - - real = VIRT_CTR(stag, i); - - rdmsr(p4_counters[real].cccr_address, low, high); - rdmsr(p4_counters[real].counter_address, ctr, high); - if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { - oprofile_add_sample(regs, i); - wrmsrl(p4_counters[real].counter_address, - -(u64)reset_value[i]); - CCCR_CLEAR_OVF(low); - wrmsr(p4_counters[real].cccr_address, low, high); - wrmsrl(p4_counters[real].counter_address, - -(u64)reset_value[i]); - } - } - - /* P4 quirk: you have to re-unmask the apic vector */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); - - /* See op_model_ppro.c */ - return 1; -} - - -static void p4_start(struct op_msrs const * const msrs) -{ - unsigned int low, high, stag; - int i; - - stag = get_stagger(); - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - CCCR_SET_ENABLE(low); - wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - } -} - - -static void p4_stop(struct op_msrs const * const msrs) -{ - unsigned int low, high, stag; - int i; - - stag = get_stagger(); - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - CCCR_SET_DISABLE(low); - wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - } -} - -#ifdef CONFIG_SMP -struct op_x86_model_spec op_p4_ht2_spec = { - .num_counters = NUM_COUNTERS_HT2, - .num_controls = NUM_CONTROLS_HT2, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown -}; -#endif - -struct op_x86_model_spec op_p4_spec = { - .num_counters = NUM_COUNTERS_NON_HT, - .num_controls = NUM_CONTROLS_NON_HT, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown -}; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c deleted file mode 100644 index 7913b6921959..000000000000 --- a/arch/x86/oprofile/op_model_ppro.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * @file op_model_ppro.h - * Family 6 perfmon and architectural perfmon MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Copyright 2008 Intel Corporation - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - * @author Andi Kleen - * @author Robert Richter <robert.richter@amd.com> - */ - -#include <linux/oprofile.h> -#include <linux/slab.h> -#include <asm/ptrace.h> -#include <asm/msr.h> -#include <asm/apic.h> -#include <asm/nmi.h> - -#include "op_x86_model.h" -#include "op_counter.h" - -static int num_counters = 2; -static int counter_width = 32; - -#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) - -static u64 reset_value[OP_MAX_COUNTER]; - -static void ppro_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (!msrs->counters[i].addr) - continue; - release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); - } -} - -static int ppro_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < num_counters; i++) { - if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) - goto fail; - if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) { - release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - goto fail; - } - /* both registers must be reserved */ - msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; - msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; - continue; - fail: - if (!counter_config[i].enabled) - continue; - op_x86_warn_reserved(i); - ppro_shutdown(msrs); - return -EBUSY; - } - - return 0; -} - - -static void ppro_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) { - union cpuid10_eax eax; - eax.full = cpuid_eax(0xa); - - /* - * For Core2 (family 6, model 15), don't reset the - * counter width: - */ - if (!(eax.split.version_id == 0 && - __this_cpu_read(cpu_info.x86) == 6 && - __this_cpu_read(cpu_info.x86_model) == 15)) { - - if (counter_width < eax.split.bit_width) - counter_width = eax.split.bit_width; - } - } - - /* clear all counters */ - for (i = 0; i < num_counters; ++i) { - if (!msrs->controls[i].addr) - continue; - rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) - op_x86_warn_in_use(i); - val &= model->reserved; - wrmsrl(msrs->controls[i].addr, val); - /* - * avoid a false detection of ctr overflows in NMI * - * handler - */ - wrmsrl(msrs->counters[i].addr, -1LL); - } - - /* enable active counters */ - for (i = 0; i < num_counters; ++i) { - if (counter_config[i].enabled && msrs->counters[i].addr) { - reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[i]); - wrmsrl(msrs->controls[i].addr, val); - } else { - reset_value[i] = 0; - } - } -} - - -static int ppro_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsrl(msrs->counters[i].addr, val); - if (val & (1ULL << (counter_width - 1))) - continue; - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - } - - /* Only P6 based Pentium M need to re-unmask the apic vector but it - * doesn't hurt other P6 variant */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); - - /* We can't work out if we really handled an interrupt. We - * might have caught a *second* counter just after overflowing - * the interrupt for this counter then arrives - * and we don't find a counter that's overflowed, so we - * would return 0 and get dazed + confused. Instead we always - * assume we found an overflow. This sucks. - */ - return 1; -} - - -static void ppro_start(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (reset_value[i]) { - rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } - } -} - - -static void ppro_stop(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } -} - -struct op_x86_model_spec op_ppro_spec = { - .num_counters = 2, - .num_controls = 2, - .reserved = MSR_PPRO_EVENTSEL_RESERVED, - .fill_in_addresses = &ppro_fill_in_addresses, - .setup_ctrs = &ppro_setup_ctrs, - .check_ctrs = &ppro_check_ctrs, - .start = &ppro_start, - .stop = &ppro_stop, - .shutdown = &ppro_shutdown -}; - -/* - * Architectural performance monitoring. - * - * Newer Intel CPUs (Core1+) have support for architectural - * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. - * The advantage of this is that it can be done without knowing about - * the specific CPU. - */ - -static void arch_perfmon_setup_counters(void) -{ - union cpuid10_eax eax; - - eax.full = cpuid_eax(0xa); - - /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ - if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 15) { - eax.split.version_id = 2; - eax.split.num_counters = 2; - eax.split.bit_width = 40; - } - - num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER); - - op_arch_perfmon_spec.num_counters = num_counters; - op_arch_perfmon_spec.num_controls = num_counters; -} - -static int arch_perfmon_init(struct oprofile_operations *ignore) -{ - arch_perfmon_setup_counters(); - return 0; -} - -struct op_x86_model_spec op_arch_perfmon_spec = { - .reserved = MSR_PPRO_EVENTSEL_RESERVED, - .init = &arch_perfmon_init, - /* num_counters/num_controls filled in at runtime */ - .fill_in_addresses = &ppro_fill_in_addresses, - /* user space does the cpuid check for available events */ - .setup_ctrs = &ppro_setup_ctrs, - .check_ctrs = &ppro_check_ctrs, - .start = &ppro_start, - .stop = &ppro_stop, - .shutdown = &ppro_shutdown -}; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h deleted file mode 100644 index 276cf79b5d24..000000000000 --- a/arch/x86/oprofile/op_x86_model.h +++ /dev/null @@ -1,90 +0,0 @@ -/** - * @file op_x86_model.h - * interface to x86 model-specific MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author Graydon Hoare - * @author Robert Richter <robert.richter@amd.com> - */ - -#ifndef OP_X86_MODEL_H -#define OP_X86_MODEL_H - -#include <asm/types.h> -#include <asm/perf_event.h> - -struct op_msr { - unsigned long addr; - u64 saved; -}; - -struct op_msrs { - struct op_msr *counters; - struct op_msr *controls; - struct op_msr *multiplex; -}; - -struct pt_regs; - -struct oprofile_operations; - -/* The model vtable abstracts the differences between - * various x86 CPU models' perfctr support. - */ -struct op_x86_model_spec { - unsigned int num_counters; - unsigned int num_controls; - unsigned int num_virt_counters; - u64 reserved; - u16 event_mask; - int (*init)(struct oprofile_operations *ops); - int (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs); - int (*check_ctrs)(struct pt_regs * const regs, - struct op_msrs const * const msrs); - void (*start)(struct op_msrs const * const msrs); - void (*stop)(struct op_msrs const * const msrs); - void (*shutdown)(struct op_msrs const * const msrs); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - void (*switch_ctrl)(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs); -#endif -}; - -struct op_counter_config; - -static inline void op_x86_warn_in_use(int counter) -{ - /* - * The warning indicates an already running counter. If - * oprofile doesn't collect data, then try using a different - * performance counter on your platform to monitor the desired - * event. Delete counter #%d from the desired event by editing - * the /usr/share/oprofile/%s/<cpu>/events file. If the event - * cannot be monitored by any other counter, contact your - * hardware or BIOS vendor. - */ - pr_warn("oprofile: counter #%d on cpu #%d may already be used\n", - counter, smp_processor_id()); -} - -static inline void op_x86_warn_reserved(int counter) -{ - pr_warn("oprofile: counter #%d is already reserved\n", counter); -} - -extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, - struct op_counter_config *counter_config); -extern int op_x86_phys_to_virt(int phys); -extern int op_x86_virt_to_phys(int virt); - -extern struct op_x86_model_spec op_ppro_spec; -extern struct op_x86_model_spec op_p4_spec; -extern struct op_x86_model_spec op_p4_ht2_spec; -extern struct op_x86_model_spec op_amd_spec; -extern struct op_x86_model_spec op_arch_perfmon_spec; - -#endif /* OP_X86_MODEL_H */ diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index fa855bbaebaf..f2f4a5d50b27 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -366,9 +366,9 @@ static int __init pcibios_assign_resources(void) return 0; } -/** - * called in fs_initcall (one below subsys_initcall), - * give a chance for motherboard reserve resources +/* + * This is an fs_initcall (one below subsys_initcall) in order to reserve + * resources properly. */ fs_initcall(pcibios_assign_resources); diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 00bfa1ebad6c..0bb3b8b44e4e 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -9,16 +9,23 @@ in the right sequence from here. */ static __init int pci_arch_init(void) { - int type; - - x86_create_pci_msi_domain(); + int type, pcbios = 1; type = pci_direct_probe(); if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); - if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) + if (x86_init.pci.arch_init) + pcbios = x86_init.pci.arch_init(); + + /* + * Must happen after x86_init.pci.arch_init(). Xen sets up the + * x86_init.irqs.create_pci_msi_domain there. + */ + x86_create_pci_msi_domain(); + + if (!pcbios) return 0; pci_pcbios_init(); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 24ca4ee2802f..8edd62206604 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -28,10 +28,12 @@ #include <linux/io.h> #include <linux/smp.h> +#include <asm/cpu_device_id.h> #include <asm/segment.h> #include <asm/pci_x86.h> #include <asm/hw_irq.h> #include <asm/io_apic.h> +#include <asm/intel-family.h> #include <asm/intel-mid.h> #include <asm/acpi.h> @@ -140,6 +142,7 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, * type1_access_ok - check whether to use type 1 * @bus: bus number * @devfn: device & function in question + * @reg: configuration register offset * * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at * all, the we can go ahead with any reads & writes. If it's on a Lincroft, @@ -212,10 +215,17 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, where, size, value); } +static const struct x86_cpu_id intel_mid_cpu_ids[] = { + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, NULL), + {} +}; + static int intel_mid_pci_irq_enable(struct pci_dev *dev) { + const struct x86_cpu_id *id; struct irq_alloc_info info; - int polarity; + bool polarity_low; + u16 model = 0; int ret; u8 gsi; @@ -228,9 +238,13 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) return ret; } - switch (intel_mid_identify_cpu()) { - case INTEL_MID_CPU_CHIP_TANGIER: - polarity = IOAPIC_POL_HIGH; + id = x86_match_cpu(intel_mid_cpu_ids); + if (id) + model = id->model; + + switch (model) { + case INTEL_FAM6_ATOM_SILVERMONT_MID: + polarity_low = false; /* Special treatment for IRQ0 */ if (gsi == 0) { @@ -252,11 +266,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) } break; default: - polarity = IOAPIC_POL_LOW; + polarity_low = true; break; } - ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 6fa42e9c4e6f..de6bf0e7e8f8 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -11,9 +11,9 @@ * themselves. */ +#include <linux/acpi.h> #include <linux/pci.h> #include <linux/init.h> -#include <linux/sfi_acpi.h> #include <linux/bitmap.h> #include <linux/dmi.h> #include <linux/slab.h> @@ -425,7 +425,7 @@ static acpi_status find_mboard_resource(acpi_handle handle, u32 lvl, return AE_OK; } -static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used) +static bool is_acpi_reserved(u64 start, u64 end, enum e820_type not_used) { struct resource mcfg_res; @@ -442,7 +442,7 @@ static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used) return mcfg_res.flags; } -typedef bool (*check_reserved_t)(u64 start, u64 end, unsigned type); +typedef bool (*check_reserved_t)(u64 start, u64 end, enum e820_type type); static bool __ref is_mmconf_reserved(check_reserved_t is_reserved, struct pci_mmcfg_region *cfg, @@ -665,7 +665,7 @@ void __init pci_mmcfg_early_init(void) if (pci_mmcfg_check_hostbridge()) known_bridge = 1; else - acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); __pci_mmcfg_init(1); set_apei_filter(); @@ -683,7 +683,7 @@ void __init pci_mmcfg_late_init(void) /* MMCONFIG hasn't been enabled yet, try again */ if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) { - acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); __pci_mmcfg_init(0); } } diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 5701d5ba3df4..7d2525691854 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -11,7 +11,8 @@ #include <linux/pci_ids.h> #include <linux/export.h> #include <linux/list.h> -#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> +#include <linux/swiotlb.h> #include <asm/iommu.h> #define STA2X11_SWIOTLB_SIZE (4*1024*1024) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index c552cd2d0632..3d41a09c2c14 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -152,7 +152,6 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi, #if defined(CONFIG_PCI_MSI) #include <linux/msi.h> -#include <asm/msidef.h> struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); @@ -210,23 +209,20 @@ free: return ret; } -#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ - MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) - static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, struct msi_msg *msg) { - /* We set vector == 0 to tell the hypervisor we don't care about it, - * but we want a pirq setup instead. - * We use the dest_id field to pass the pirq that we want. */ - msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq); - msg->address_lo = - MSI_ADDR_BASE_LO | - MSI_ADDR_DEST_MODE_PHYSICAL | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(pirq); - - msg->data = XEN_PIRQ_MSI_DATA; + /* + * We set vector == 0 to tell the hypervisor we don't care about + * it, but we want a pirq setup instead. We use the dest_id fields + * to pass the pirq that we want. + */ + memset(msg, 0, sizeof(*msg)); + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + msg->arch_addr_hi.destid_8_31 = pirq >> 8; + msg->arch_addr_lo.destid_0_7 = pirq & 0xFF; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_EXTINT; } static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index d0e835470d01..3ed03a2552d0 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -4,13 +4,11 @@ obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ -obj-y += goldfish/ obj-y += iris/ obj-y += intel/ obj-y += intel-mid/ obj-y += intel-quark/ obj-y += olpc/ obj-y += scx200/ -obj-y += sfi/ obj-y += ts5500/ obj-y += uv/ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index e1e8d4e3a213..1b82d77019b1 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -54,10 +54,7 @@ * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. */ static u64 efi_va = EFI_VA_START; - -struct efi_scratch efi_scratch; - -EXPORT_SYMBOL_GPL(efi_mm); +static struct mm_struct *efi_prev_mm; /* * We need our own copy of the higher levels of the page tables @@ -115,31 +112,12 @@ void efi_sync_low_kernel_mappings(void) pud_t *pud_k, *pud_efi; pgd_t *efi_pgd = efi_mm.pgd; - /* - * We can share all PGD entries apart from the one entry that - * covers the EFI runtime mapping space. - * - * Make sure the EFI runtime region mappings are guaranteed to - * only span a single PGD entry and that the entry also maps - * other important kernel regions. - */ - MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); - MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != - (EFI_VA_END & PGDIR_MASK)); - pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); pgd_k = pgd_offset_k(PAGE_OFFSET); num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET); memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); - /* - * As with PGDs, we share all P4D entries apart from the one entry - * that covers the EFI runtime mapping space. - */ - BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END)); - BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK)); - pgd_efi = efi_pgd + pgd_index(EFI_VA_END); pgd_k = pgd_offset_k(EFI_VA_END); p4d_efi = p4d_offset(pgd_efi, 0); @@ -256,7 +234,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 1; } - efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */ + efi_mixed_mode_stack_pa = page_to_phys(page + 1); /* stack grows down */ npages = (_etext - _text) >> PAGE_SHIFT; text = __pa(_text); @@ -481,11 +459,17 @@ void __init efi_dump_pagetable(void) * can not change under us. * It should be ensured that there are no concurent calls to this function. */ -void efi_switch_mm(struct mm_struct *mm) +void efi_enter_mm(void) +{ + efi_prev_mm = current->active_mm; + current->active_mm = &efi_mm; + switch_mm(efi_prev_mm, &efi_mm, NULL); +} + +void efi_leave_mm(void) { - efi_scratch.prev_mm = current->active_mm; - current->active_mm = mm; - switch_mm(efi_scratch.prev_mm, mm, NULL); + current->active_mm = efi_prev_mm; + switch_mm(&efi_mm, efi_prev_mm, NULL); } static DEFINE_SPINLOCK(efi_runtime_lock); @@ -549,12 +533,12 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size, efi_sync_low_kernel_mappings(); local_irq_save(flags); - efi_switch_mm(&efi_mm); + efi_enter_mm(); status = __efi_thunk(set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); - efi_switch_mm(efi_scratch.prev_mm); + efi_leave_mm(); local_irq_restore(flags); return status; @@ -848,9 +832,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size, descriptor_size, descriptor_version, virtual_map); - efi_switch_mm(&efi_mm); + efi_enter_mm(); - kernel_fpu_begin(); + efi_fpu_begin(); /* Disable interrupts around EFI calls: */ local_irq_save(flags); @@ -859,12 +843,12 @@ efi_set_virtual_address_map(unsigned long memory_map_size, descriptor_version, virtual_map); local_irq_restore(flags); - kernel_fpu_end(); + efi_fpu_end(); /* grab the virtually remapped EFI runtime services table pointer */ efi.runtime = READ_ONCE(systab->runtime); - efi_switch_mm(efi_scratch.prev_mm); + efi_leave_mm(); return status; } diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 26f0da238c1c..fd3dd1708eba 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -33,7 +33,7 @@ SYM_CODE_START(__efi64_thunk) * Switch to 1:1 mapped 32-bit stack pointer. */ movq %rsp, %rax - movq efi_scratch(%rip), %rsp + movq efi_mixed_mode_stack_pa(%rip), %rsp push %rax /* @@ -70,3 +70,7 @@ SYM_CODE_START(__efi64_thunk) pushl %ebp lret SYM_CODE_END(__efi64_thunk) + + .bss + .balign 8 +SYM_DATA(efi_mixed_mode_stack_pa, .quad 0) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 5a40fe411ebd..67d93a243c35 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, * @return: Returns, if the page fault is not handled. This function * will never return if the page fault is handled successfully. */ -void efi_recover_from_page_fault(unsigned long phys_addr) +void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) { if (!IS_ENABLED(CONFIG_X86_64)) return; /* + * If we get an interrupt/NMI while processing an EFI runtime service + * then this is a regular OOPS, not an EFI failure. + */ + if (in_interrupt()) + return; + + /* * Make sure that an efi runtime service caused the page fault. + * READ_ONCE() because we might be OOPSing in a different thread, + * and we don't want to trip KTSAN while trying to OOPS. */ - if (efi_rts_work.efi_rts_id == EFI_NONE) + if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE || + current_work() != &efi_rts_work.work) return; /* @@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr) set_current_state(TASK_IDLE); schedule(); } - - return; } diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index c33f744b5388..b39bf3b5e108 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -22,6 +22,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <linux/dmi.h> #include <asm/geode.h> @@ -69,21 +70,15 @@ static struct platform_device alix_buttons_dev = { static struct gpio_led alix_leds[] = { { .name = "alix:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 1, }, { .name = "alix:2", - .gpio = 25, .default_trigger = "default-off", - .active_low = 1, }, { .name = "alix:3", - .gpio = 27, .default_trigger = "default-off", - .active_low = 1, }, }; @@ -92,6 +87,17 @@ static struct gpio_led_platform_data alix_leds_data = { .leds = alix_leds, }; +static struct gpiod_lookup_table alix_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW), + { } + }, +}; + static struct platform_device alix_leds_dev = { .name = "leds-gpio", .id = -1, @@ -106,6 +112,7 @@ static struct platform_device *alix_devs[] __initdata = { static void __init register_alix(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&alix_leds_gpio_table); platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs)); } diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index 73a3f49b4eb6..d263528c90bb 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -20,6 +20,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <linux/dmi.h> #include <asm/geode.h> @@ -53,21 +54,15 @@ static struct platform_device geos_buttons_dev = { static struct gpio_led geos_leds[] = { { .name = "geos:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 1, }, { .name = "geos:2", - .gpio = 25, .default_trigger = "default-off", - .active_low = 1, }, { .name = "geos:3", - .gpio = 27, .default_trigger = "default-off", - .active_low = 1, }, }; @@ -76,6 +71,17 @@ static struct gpio_led_platform_data geos_leds_data = { .leds = geos_leds, }; +static struct gpiod_lookup_table geos_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW), + { } + }, +}; + static struct platform_device geos_leds_dev = { .name = "leds-gpio", .id = -1, @@ -90,6 +96,7 @@ static struct platform_device *geos_devs[] __initdata = { static void __init register_geos(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&geos_leds_gpio_table); platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs)); } diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 163e1b545517..558384acd777 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,6 +20,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <asm/geode.h> @@ -55,9 +56,7 @@ static struct platform_device net5501_buttons_dev = { static struct gpio_led net5501_leds[] = { { .name = "net5501:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 0, }, }; @@ -66,6 +65,15 @@ static struct gpio_led_platform_data net5501_leds_data = { .leds = net5501_leds, }; +static struct gpiod_lookup_table net5501_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH), + { } + }, +}; + static struct platform_device net5501_leds_dev = { .name = "leds-gpio", .id = -1, @@ -80,6 +88,7 @@ static struct platform_device *net5501_devs[] __initdata = { static void __init register_net5501(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&net5501_leds_gpio_table); platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs)); } diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile deleted file mode 100644 index 072c395379ac..000000000000 --- a/arch/x86/platform/goldfish/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_GOLDFISH) += goldfish.o diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c deleted file mode 100644 index 6b6f8b4360dd..000000000000 --- a/arch/x86/platform/goldfish/goldfish.c +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2007 Google, Inc. - * Copyright (C) 2011 Intel, Inc. - * Copyright (C) 2013 Intel, Inc. - */ - -#include <linux/kernel.h> -#include <linux/irq.h> -#include <linux/platform_device.h> - -/* - * Where in virtual device memory the IO devices (timers, system controllers - * and so on) - */ - -#define GOLDFISH_PDEV_BUS_BASE (0xff001000) -#define GOLDFISH_PDEV_BUS_END (0xff7fffff) -#define GOLDFISH_PDEV_BUS_IRQ (4) - -#define GOLDFISH_TTY_BASE (0x2000) - -static struct resource goldfish_pdev_bus_resources[] = { - { - .start = GOLDFISH_PDEV_BUS_BASE, - .end = GOLDFISH_PDEV_BUS_END, - .flags = IORESOURCE_MEM, - }, - { - .start = GOLDFISH_PDEV_BUS_IRQ, - .end = GOLDFISH_PDEV_BUS_IRQ, - .flags = IORESOURCE_IRQ, - } -}; - -static bool goldfish_enable __initdata; - -static int __init goldfish_setup(char *str) -{ - goldfish_enable = true; - return 0; -} -__setup("goldfish", goldfish_setup); - -static int __init goldfish_init(void) -{ - if (!goldfish_enable) - return -ENODEV; - - platform_device_register_simple("goldfish_pdev_bus", -1, - goldfish_pdev_bus_resources, 2); - return 0; -} -device_initcall(goldfish_init); diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index cc2549f0ccb1..ddfc08783fb8 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,7 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o pwr.o - -# SFI specific code -ifdef CONFIG_X86_INTEL_MID -obj-$(CONFIG_SFI) += sfi.o device_libs/ -endif +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o pwr.o diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile deleted file mode 100644 index 480fed21cc7d..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Family-Level Interface Shim (FLIS) -obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o -# SDHCI Devices -obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o -# WiFi + BT -obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o -obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o -# IPC Devices -obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o -obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o -obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o -obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o -obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o -obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o -obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o -# SPI Devices -obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o -# I2C Devices -obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o -obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o -obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o -obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o -obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o -# I2C GPIO Expanders -obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o -obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o -obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o -# MISC Devices -obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o -obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_mrfld_power_btn.o -obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o -obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c deleted file mode 100644 index 564c47c53f3a..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_bcm43xx.c: bcm43xx platform data initialization file - * - * (C) Copyright 2016 Intel Corporation - * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - */ - -#include <linux/gpio/machine.h> -#include <linux/platform_device.h> -#include <linux/regulator/machine.h> -#include <linux/regulator/fixed.h> -#include <linux/sfi.h> - -#include <asm/intel-mid.h> - -#define WLAN_SFI_GPIO_IRQ_NAME "WLAN-interrupt" -#define WLAN_SFI_GPIO_ENABLE_NAME "WLAN-enable" - -#define WLAN_DEV_NAME "0000:00:01.3" - -static struct regulator_consumer_supply bcm43xx_vmmc_supply = { - .dev_name = WLAN_DEV_NAME, - .supply = "vmmc", -}; - -static struct regulator_init_data bcm43xx_vmmc_data = { - .constraints = { - .valid_ops_mask = REGULATOR_CHANGE_STATUS, - }, - .num_consumer_supplies = 1, - .consumer_supplies = &bcm43xx_vmmc_supply, -}; - -static struct fixed_voltage_config bcm43xx_vmmc = { - .supply_name = "bcm43xx-vmmc-regulator", - /* - * Announce 2.0V here to be compatible with SDIO specification. The - * real voltage and signaling are still 1.8V. - */ - .microvolts = 2000000, /* 1.8V */ - .startup_delay = 250 * 1000, /* 250ms */ - .enabled_at_boot = 0, /* disabled at boot */ - .init_data = &bcm43xx_vmmc_data, -}; - -static struct platform_device bcm43xx_vmmc_regulator = { - .name = "reg-fixed-voltage", - .id = PLATFORM_DEVID_AUTO, - .dev = { - .platform_data = &bcm43xx_vmmc, - }, -}; - -static struct gpiod_lookup_table bcm43xx_vmmc_gpio_table = { - .dev_id = "reg-fixed-voltage.0", - .table = { - GPIO_LOOKUP("0000:00:0c.0", -1, NULL, GPIO_ACTIVE_LOW), - {} - }, -}; - -static int __init bcm43xx_regulator_register(void) -{ - struct gpiod_lookup_table *table = &bcm43xx_vmmc_gpio_table; - struct gpiod_lookup *lookup = table->table; - int ret; - - lookup[0].chip_hwnum = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME); - gpiod_add_lookup_table(table); - - ret = platform_device_register(&bcm43xx_vmmc_regulator); - if (ret) { - pr_err("%s: vmmc regulator register failed\n", __func__); - return ret; - } - - return 0; -} - -static void __init *bcm43xx_platform_data(void *info) -{ - int ret; - - ret = bcm43xx_regulator_register(); - if (ret) - return NULL; - - pr_info("Using generic wifi platform data\n"); - - /* For now it's empty */ - return NULL; -} - -static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = { - .name = "bcm43xx_clk_vmmc", - .type = SFI_DEV_TYPE_SD, - .get_platform_data = &bcm43xx_platform_data, -}; - -sfi_device(bcm43xx_clk_vmmc_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c deleted file mode 100644 index 32912a17f68e..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_bma023.c: bma023 platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - */ - -#include <asm/intel-mid.h> - -static const struct devs_id bma023_dev_id __initconst = { - .name = "bma023", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, -}; - -sfi_device(bma023_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c deleted file mode 100644 index 31dda18bb370..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Bluetooth platform data initialization file - * - * (C) Copyright 2017 Intel Corporation - * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - */ - -#include <linux/gpio/machine.h> -#include <linux/pci.h> -#include <linux/platform_device.h> - -#include <asm/cpu_device_id.h> -#include <asm/intel-family.h> -#include <asm/intel-mid.h> - -struct bt_sfi_data { - struct device *dev; - const char *name; - int (*setup)(struct bt_sfi_data *ddata); -}; - -static struct gpiod_lookup_table tng_bt_sfi_gpio_table = { - .dev_id = "hci_bcm", - .table = { - GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown", GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup", GPIO_ACTIVE_HIGH), - { }, - }, -}; - -#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP "bt_wakeup" -#define TNG_BT_SFI_GPIO_SHUTDOWN "BT-reset" -#define TNG_BT_SFI_GPIO_HOST_WAKEUP "bt_uart_enable" - -static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) -{ - struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table; - struct gpiod_lookup *lookup = table->table; - struct pci_dev *pdev; - - /* Connected to /dev/ttyS0 */ - pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1)); - if (!pdev) - return -ENODEV; - - ddata->dev = &pdev->dev; - ddata->name = table->dev_id; - - lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP); - lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN); - lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP); - - gpiod_add_lookup_table(table); - return 0; -} - -static struct bt_sfi_data tng_bt_sfi_data __initdata = { - .setup = tng_bt_sfi_setup, -}; - -static const struct x86_cpu_id bt_sfi_cpu_ids[] = { - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &tng_bt_sfi_data), - {} -}; - -static int __init bt_sfi_init(void) -{ - struct platform_device_info info; - struct platform_device *pdev; - const struct x86_cpu_id *id; - struct bt_sfi_data *ddata; - int ret; - - id = x86_match_cpu(bt_sfi_cpu_ids); - if (!id) - return -ENODEV; - - ddata = (struct bt_sfi_data *)id->driver_data; - if (!ddata) - return -ENODEV; - - ret = ddata->setup(ddata); - if (ret) - return ret; - - memset(&info, 0, sizeof(info)); - info.fwnode = ddata->dev->fwnode; - info.parent = ddata->dev; - info.name = ddata->name, - info.id = PLATFORM_DEVID_NONE, - - pdev = platform_device_register_full(&info); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name); - return 0; -} -device_initcall(bt_sfi_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c deleted file mode 100644 index a2508582a0b1..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_emc1403.c: emc1403 platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/init.h> -#include <linux/gpio.h> -#include <linux/i2c.h> -#include <asm/intel-mid.h> - -static void __init *emc1403_platform_data(void *info) -{ - static short intr2nd_pdata; - struct i2c_board_info *i2c_info = info; - int intr = get_gpio_by_name("thermal_int"); - int intr2nd = get_gpio_by_name("thermal_alert"); - - if (intr < 0) - return NULL; - if (intr2nd < 0) - return NULL; - - i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; - intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET; - - return &intr2nd_pdata; -} - -static const struct devs_id emc1403_dev_id __initconst = { - .name = "emc1403", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &emc1403_platform_data, -}; - -sfi_device(emc1403_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c deleted file mode 100644 index d9435d2196a4..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_gpio_keys.c: gpio_keys platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/input.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/gpio.h> -#include <linux/gpio_keys.h> -#include <linux/platform_device.h> -#include <asm/intel-mid.h> - -#define DEVICE_NAME "gpio-keys" - -/* - * we will search these buttons in SFI GPIO table (by name) - * and register them dynamically. Please add all possible - * buttons here, we will shrink them if no GPIO found. - */ -static struct gpio_keys_button gpio_button[] = { - {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000}, - {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20}, - {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20}, - {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, - {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, - {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, - {KEY_MUTE, -1, 1, "mute_enable", EV_KEY, 0, 20}, - {KEY_VOLUMEUP, -1, 1, "volume_up", EV_KEY, 0, 20}, - {KEY_VOLUMEDOWN, -1, 1, "volume_down", EV_KEY, 0, 20}, - {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, - {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, - {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, - {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20}, -}; - -static struct gpio_keys_platform_data gpio_keys = { - .buttons = gpio_button, - .rep = 1, - .nbuttons = -1, /* will fill it after search */ -}; - -static struct platform_device pb_device = { - .name = DEVICE_NAME, - .id = -1, - .dev = { - .platform_data = &gpio_keys, - }, -}; - -/* - * Shrink the non-existent buttons, register the gpio button - * device if there is some - */ -static int __init pb_keys_init(void) -{ - struct gpio_keys_button *gb = gpio_button; - int i, good = 0; - - for (i = 0; i < ARRAY_SIZE(gpio_button); i++) { - gb[i].gpio = get_gpio_by_name(gb[i].desc); - pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, - gb[i].gpio); - if (gb[i].gpio < 0) - continue; - - if (i != good) - gb[good] = gb[i]; - good++; - } - - if (good) { - gpio_keys.nbuttons = good; - return platform_device_register(&pb_device); - } - return 0; -} -late_initcall(pb_keys_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c deleted file mode 100644 index a4485cd638c6..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_lis331.c: lis331 platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/i2c.h> -#include <linux/gpio.h> -#include <asm/intel-mid.h> - -static void __init *lis331dl_platform_data(void *info) -{ - static short intr2nd_pdata; - struct i2c_board_info *i2c_info = info; - int intr = get_gpio_by_name("accel_int"); - int intr2nd = get_gpio_by_name("accel_2"); - - if (intr < 0) - return NULL; - if (intr2nd < 0) - return NULL; - - i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; - intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET; - - return &intr2nd_pdata; -} - -static const struct devs_id lis331dl_dev_id __initconst = { - .name = "i2c_accel", - .type = SFI_DEV_TYPE_I2C, - .get_platform_data = &lis331dl_platform_data, -}; - -sfi_device(lis331dl_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c deleted file mode 100644 index e9287c3184da..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_max7315.c: max7315 platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/init.h> -#include <linux/gpio.h> -#include <linux/i2c.h> -#include <linux/platform_data/pca953x.h> -#include <asm/intel-mid.h> - -#define MAX7315_NUM 2 - -static void __init *max7315_platform_data(void *info) -{ - static struct pca953x_platform_data max7315_pdata[MAX7315_NUM]; - static int nr; - struct pca953x_platform_data *max7315 = &max7315_pdata[nr]; - struct i2c_board_info *i2c_info = info; - int gpio_base, intr; - char base_pin_name[SFI_NAME_LEN + 1]; - char intr_pin_name[SFI_NAME_LEN + 1]; - - if (nr == MAX7315_NUM) { - pr_err("too many max7315s, we only support %d\n", - MAX7315_NUM); - return NULL; - } - /* we have several max7315 on the board, we only need load several - * instances of the same pca953x driver to cover them - */ - strcpy(i2c_info->type, "max7315"); - if (nr++) { - snprintf(base_pin_name, sizeof(base_pin_name), - "max7315_%d_base", nr); - snprintf(intr_pin_name, sizeof(intr_pin_name), - "max7315_%d_int", nr); - } else { - strcpy(base_pin_name, "max7315_base"); - strcpy(intr_pin_name, "max7315_int"); - } - - gpio_base = get_gpio_by_name(base_pin_name); - intr = get_gpio_by_name(intr_pin_name); - - if (gpio_base < 0) - return NULL; - max7315->gpio_base = gpio_base; - if (intr != -1) { - i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; - max7315->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; - } else { - i2c_info->irq = -1; - max7315->irq_base = -1; - } - return max7315; -} - -static const struct devs_id max7315_dev_id __initconst = { - .name = "i2c_max7315", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &max7315_platform_data, -}; - -static const struct devs_id max7315_2_dev_id __initconst = { - .name = "i2c_max7315_2", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &max7315_platform_data, -}; - -sfi_device(max7315_dev_id); -sfi_device(max7315_2_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c deleted file mode 100644 index 28a182713934..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_mpu3050.c: mpu3050 platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/gpio.h> -#include <linux/i2c.h> -#include <asm/intel-mid.h> - -static void *mpu3050_platform_data(void *info) -{ - struct i2c_board_info *i2c_info = info; - int intr = get_gpio_by_name("mpu3050_int"); - - if (intr < 0) - return NULL; - - i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; - return NULL; -} - -static const struct devs_id mpu3050_dev_id __initconst = { - .name = "mpu3050", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &mpu3050_platform_data, -}; - -sfi_device(mpu3050_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c deleted file mode 100644 index 605e1f94ad89..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Merrifield FLIS platform device initialization file - * - * Copyright (C) 2016, Intel Corporation - * - * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - */ - -#include <linux/init.h> -#include <linux/ioport.h> -#include <linux/platform_device.h> - -#include <asm/intel-mid.h> - -#define FLIS_BASE_ADDR 0xff0c0000 -#define FLIS_LENGTH 0x8000 - -static struct resource mrfld_pinctrl_mmio_resource = { - .start = FLIS_BASE_ADDR, - .end = FLIS_BASE_ADDR + FLIS_LENGTH - 1, - .flags = IORESOURCE_MEM, -}; - -static struct platform_device mrfld_pinctrl_device = { - .name = "pinctrl-merrifield", - .id = PLATFORM_DEVID_NONE, - .resource = &mrfld_pinctrl_mmio_resource, - .num_resources = 1, -}; - -static int __init mrfld_pinctrl_init(void) -{ - if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) - return platform_device_register(&mrfld_pinctrl_device); - - return -ENODEV; -} -arch_initcall(mrfld_pinctrl_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c deleted file mode 100644 index ec2afb41b34a..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Merrifield power button support - * - * (C) Copyright 2017 Intel Corporation - * - * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - */ - -#include <linux/init.h> -#include <linux/ioport.h> -#include <linux/platform_device.h> -#include <linux/sfi.h> - -#include <asm/intel-mid.h> -#include <asm/intel_scu_ipc.h> - -static struct resource mrfld_power_btn_resources[] = { - { - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device mrfld_power_btn_dev = { - .name = "msic_power_btn", - .id = PLATFORM_DEVID_NONE, - .num_resources = ARRAY_SIZE(mrfld_power_btn_resources), - .resource = mrfld_power_btn_resources, -}; - -static int mrfld_power_btn_scu_status_change(struct notifier_block *nb, - unsigned long code, void *data) -{ - if (code == SCU_DOWN) { - platform_device_unregister(&mrfld_power_btn_dev); - return 0; - } - - return platform_device_register(&mrfld_power_btn_dev); -} - -static struct notifier_block mrfld_power_btn_scu_notifier = { - .notifier_call = mrfld_power_btn_scu_status_change, -}; - -static int __init register_mrfld_power_btn(void) -{ - if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) - return -ENODEV; - - /* - * We need to be sure that the SCU IPC is ready before - * PMIC power button device can be registered: - */ - intel_scu_notifier_add(&mrfld_power_btn_scu_notifier); - - return 0; -} -arch_initcall(register_mrfld_power_btn); - -static void __init *mrfld_power_btn_platform_data(void *info) -{ - struct resource *res = mrfld_power_btn_resources; - struct sfi_device_table_entry *pentry = info; - - res->start = res->end = pentry->irq; - return NULL; -} - -static const struct devs_id mrfld_power_btn_dev_id __initconst = { - .name = "bcove_power_btn", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .msic = 1, - .get_platform_data = &mrfld_power_btn_platform_data, -}; - -sfi_device(mrfld_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c deleted file mode 100644 index 40e9808a9634..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Merrifield legacy RTC initialization file - * - * (C) Copyright 2017 Intel Corporation - * - * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - */ - -#include <linux/init.h> - -#include <asm/hw_irq.h> -#include <asm/intel-mid.h> -#include <asm/io_apic.h> -#include <asm/time.h> -#include <asm/x86_init.h> - -static int __init mrfld_legacy_rtc_alloc_irq(void) -{ - struct irq_alloc_info info; - int ret; - - if (!x86_platform.legacy.rtc) - return -ENODEV; - - ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0); - ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info); - if (ret < 0) { - pr_info("Failed to allocate RTC interrupt. Disabling RTC\n"); - x86_platform.legacy.rtc = 0; - return ret; - } - - return 0; -} - -static int __init mrfld_legacy_rtc_init(void) -{ - if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) - return -ENODEV; - - return mrfld_legacy_rtc_alloc_irq(); -} -arch_initcall(mrfld_legacy_rtc_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c deleted file mode 100644 index fe3b7ff975f3..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * SDHCI platform data initilisation file - * - * (C) Copyright 2016 Intel Corporation - * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - */ - -#include <linux/init.h> -#include <linux/pci.h> - -#include <linux/mmc/sdhci-pci-data.h> - -#include <asm/intel-mid.h> - -#define INTEL_MRFLD_SD 2 -#define INTEL_MRFLD_SD_CD_GPIO 77 - -static struct sdhci_pci_data mrfld_sdhci_pci_data = { - .rst_n_gpio = -EINVAL, - .cd_gpio = INTEL_MRFLD_SD_CD_GPIO, -}; - -static struct sdhci_pci_data * -mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno) -{ - unsigned int func = PCI_FUNC(pdev->devfn); - - if (func == INTEL_MRFLD_SD) - return &mrfld_sdhci_pci_data; - - return NULL; -} - -static int __init mrfld_sd_init(void) -{ - if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) - return -ENODEV; - - sdhci_pci_get_data = mrfld_sdhci_pci_get_data; - return 0; -} -arch_initcall(mrfld_sd_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c deleted file mode 100644 index b828f4fd40be..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * spidev platform data initialization file - * - * (C) Copyright 2014, 2016 Intel Corporation - * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - * Dan O'Donovan <dan@emutex.com> - */ - -#include <linux/err.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/spi/pxa2xx_spi.h> -#include <linux/spi/spi.h> - -#include <asm/intel-mid.h> - -#define MRFLD_SPI_DEFAULT_DMA_BURST 8 -#define MRFLD_SPI_DEFAULT_TIMEOUT 500 - -/* GPIO pin for spidev chipselect */ -#define MRFLD_SPIDEV_GPIO_CS 111 - -static struct pxa2xx_spi_chip spidev_spi_chip = { - .dma_burst_size = MRFLD_SPI_DEFAULT_DMA_BURST, - .timeout = MRFLD_SPI_DEFAULT_TIMEOUT, - .gpio_cs = MRFLD_SPIDEV_GPIO_CS, -}; - -static void __init *spidev_platform_data(void *info) -{ - struct spi_board_info *spi_info = info; - - if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) - return ERR_PTR(-ENODEV); - - spi_info->mode = SPI_MODE_0; - spi_info->controller_data = &spidev_spi_chip; - - return NULL; -} - -static const struct devs_id spidev_dev_id __initconst = { - .name = "spidev", - .type = SFI_DEV_TYPE_SPI, - .delay = 0, - .get_platform_data = &spidev_platform_data, -}; - -sfi_device(spidev_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c deleted file mode 100644 index 227218a8f98e..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel Merrifield watchdog platform device library file - * - * (C) Copyright 2014 Intel Corporation - * Author: David Cohen <david.a.cohen@linux.intel.com> - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/platform_device.h> -#include <linux/platform_data/intel-mid_wdt.h> - -#include <asm/intel-mid.h> -#include <asm/intel_scu_ipc.h> -#include <asm/io_apic.h> -#include <asm/hw_irq.h> - -#define TANGIER_EXT_TIMER0_MSI 12 - -static struct platform_device wdt_dev = { - .name = "intel_mid_wdt", - .id = -1, -}; - -static int tangier_probe(struct platform_device *pdev) -{ - struct irq_alloc_info info; - struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; - int gsi = TANGIER_EXT_TIMER0_MSI; - int irq; - - if (!pdata) - return -EINVAL; - - /* IOAPIC builds identity mapping between GSI and IRQ on MID */ - ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); - if (irq < 0) { - dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); - return irq; - } - - pdata->irq = irq; - return 0; -} - -static struct intel_mid_wdt_pdata tangier_pdata = { - .probe = tangier_probe, -}; - -static int wdt_scu_status_change(struct notifier_block *nb, - unsigned long code, void *data) -{ - if (code == SCU_DOWN) { - platform_device_unregister(&wdt_dev); - return 0; - } - - return platform_device_register(&wdt_dev); -} - -static struct notifier_block wdt_scu_notifier = { - .notifier_call = wdt_scu_status_change, -}; - -static int __init register_mid_wdt(void) -{ - if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) - return -ENODEV; - - wdt_dev.dev.platform_data = &tangier_pdata; - - /* - * We need to be sure that the SCU IPC is ready before watchdog device - * can be registered: - */ - intel_scu_notifier_add(&wdt_scu_notifier); - - return 0; -} -arch_initcall(register_mid_wdt); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c deleted file mode 100644 index b17783d0d4e7..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_msic.c: MSIC platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/mfd/intel_msic.h> -#include <asm/intel_scu_ipc.h> -#include <asm/intel-mid.h> -#include "platform_msic.h" - -struct intel_msic_platform_data msic_pdata; - -static struct resource msic_resources[] = { - { - .start = INTEL_MSIC_IRQ_PHYS_BASE, - .end = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1, - .flags = IORESOURCE_MEM, - }, -}; - -static struct platform_device msic_device = { - .name = "intel_msic", - .id = -1, - .dev = { - .platform_data = &msic_pdata, - }, - .num_resources = ARRAY_SIZE(msic_resources), - .resource = msic_resources, -}; - -static int msic_scu_status_change(struct notifier_block *nb, - unsigned long code, void *data) -{ - if (code == SCU_DOWN) { - platform_device_unregister(&msic_device); - return 0; - } - - return platform_device_register(&msic_device); -} - -static int __init msic_init(void) -{ - static struct notifier_block msic_scu_notifier = { - .notifier_call = msic_scu_status_change, - }; - - /* - * We need to be sure that the SCU IPC is ready before MSIC device - * can be registered. - */ - if (intel_mid_has_msic()) - intel_scu_notifier_add(&msic_scu_notifier); - - return 0; -} -arch_initcall(msic_init); - -/* - * msic_generic_platform_data - sets generic platform data for the block - * @info: pointer to the SFI device table entry for this block - * @block: MSIC block - * - * Function sets IRQ number from the SFI table entry for given device to - * the MSIC platform data. - */ -void *msic_generic_platform_data(void *info, enum intel_msic_block block) -{ - struct sfi_device_table_entry *entry = info; - - BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST); - msic_pdata.irq[block] = entry->irq; - - return NULL; -} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h deleted file mode 100644 index 91deb2e65b0e..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * platform_msic.h: MSIC platform data header file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ -#ifndef _PLATFORM_MSIC_H_ -#define _PLATFORM_MSIC_H_ - -extern struct intel_msic_platform_data msic_pdata; - -void *msic_generic_platform_data(void *info, enum intel_msic_block block); - -#endif diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c deleted file mode 100644 index e765da78ad8c..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_msic_audio.c: MSIC audio platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/platform_device.h> -#include <linux/mfd/intel_msic.h> -#include <asm/intel-mid.h> - -#include "platform_msic.h" - -static void *msic_audio_platform_data(void *info) -{ - struct platform_device *pdev; - - pdev = platform_device_register_simple("sst-platform", -1, NULL, 0); - - if (IS_ERR(pdev)) { - pr_err("failed to create audio platform device\n"); - return NULL; - } - - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO); -} - -static const struct devs_id msic_audio_dev_id __initconst = { - .name = "msic_audio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .msic = 1, - .get_platform_data = &msic_audio_platform_data, -}; - -sfi_device(msic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c deleted file mode 100644 index f461f84903f8..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_msic_battery.c: MSIC battery platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/mfd/intel_msic.h> -#include <asm/intel-mid.h> - -#include "platform_msic.h" - -static void __init *msic_battery_platform_data(void *info) -{ - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY); -} - -static const struct devs_id msic_battery_dev_id __initconst = { - .name = "msic_battery", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .msic = 1, - .get_platform_data = &msic_battery_platform_data, -}; - -sfi_device(msic_battery_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c deleted file mode 100644 index 71a7d6db3878..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_msic_gpio.c: MSIC GPIO platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/sfi.h> -#include <linux/init.h> -#include <linux/gpio.h> -#include <linux/mfd/intel_msic.h> -#include <asm/intel-mid.h> - -#include "platform_msic.h" - -static void __init *msic_gpio_platform_data(void *info) -{ - static struct intel_msic_gpio_pdata msic_gpio_pdata; - - int gpio = get_gpio_by_name("msic_gpio_base"); - - if (gpio < 0) - return NULL; - - msic_gpio_pdata.gpio_base = gpio; - msic_pdata.gpio = &msic_gpio_pdata; - - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO); -} - -static const struct devs_id msic_gpio_dev_id __initconst = { - .name = "msic_gpio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .msic = 1, - .get_platform_data = &msic_gpio_platform_data, -}; - -sfi_device(msic_gpio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c deleted file mode 100644 index 558c0d974430..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_msic_ocd.c: MSIC OCD platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/sfi.h> -#include <linux/init.h> -#include <linux/gpio.h> -#include <linux/mfd/intel_msic.h> -#include <asm/intel-mid.h> - -#include "platform_msic.h" - -static void __init *msic_ocd_platform_data(void *info) -{ - static struct intel_msic_ocd_pdata msic_ocd_pdata; - int gpio; - - gpio = get_gpio_by_name("ocd_gpio"); - - if (gpio < 0) - return NULL; - - msic_ocd_pdata.gpio = gpio; - msic_pdata.ocd = &msic_ocd_pdata; - - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); -} - -static const struct devs_id msic_ocd_dev_id __initconst = { - .name = "msic_ocd", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .msic = 1, - .get_platform_data = &msic_ocd_platform_data, -}; - -sfi_device(msic_ocd_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c deleted file mode 100644 index 3d3de2d59726..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_msic_power_btn.c: MSIC power btn platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/sfi.h> -#include <linux/init.h> -#include <linux/mfd/intel_msic.h> -#include <asm/intel-mid.h> - -#include "platform_msic.h" - -static void __init *msic_power_btn_platform_data(void *info) -{ - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN); -} - -static const struct devs_id msic_power_btn_dev_id __initconst = { - .name = "msic_power_btn", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .msic = 1, - .get_platform_data = &msic_power_btn_platform_data, -}; - -sfi_device(msic_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c deleted file mode 100644 index 4858da1d78c6..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_msic_thermal.c: msic_thermal platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/input.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/gpio.h> -#include <linux/platform_device.h> -#include <linux/mfd/intel_msic.h> -#include <asm/intel-mid.h> - -#include "platform_msic.h" - -static void __init *msic_thermal_platform_data(void *info) -{ - return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL); -} - -static const struct devs_id msic_thermal_dev_id __initconst = { - .name = "msic_thermal", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .msic = 1, - .get_platform_data = &msic_thermal_platform_data, -}; - -sfi_device(msic_thermal_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c deleted file mode 100644 index 5609d8da3978..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * PCAL9555a platform data initialization file - * - * Copyright (C) 2016, Intel Corporation - * - * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com> - * Dan O'Donovan <dan@emutex.com> - */ - -#include <linux/gpio.h> -#include <linux/init.h> -#include <linux/i2c.h> -#include <linux/platform_data/pca953x.h> -#include <linux/sfi.h> - -#include <asm/intel-mid.h> - -#define PCAL9555A_NUM 4 - -static struct pca953x_platform_data pcal9555a_pdata[PCAL9555A_NUM]; -static int nr; - -static void __init *pcal9555a_platform_data(void *info) -{ - struct i2c_board_info *i2c_info = info; - char *type = i2c_info->type; - struct pca953x_platform_data *pcal9555a; - char base_pin_name[SFI_NAME_LEN + 1]; - char intr_pin_name[SFI_NAME_LEN + 1]; - int gpio_base, intr; - - snprintf(base_pin_name, sizeof(base_pin_name), "%s_base", type); - snprintf(intr_pin_name, sizeof(intr_pin_name), "%s_int", type); - - gpio_base = get_gpio_by_name(base_pin_name); - intr = get_gpio_by_name(intr_pin_name); - - /* Check if the SFI record valid */ - if (gpio_base == -1) - return NULL; - - if (nr >= PCAL9555A_NUM) { - pr_err("%s: Too many instances, only %d supported\n", __func__, - PCAL9555A_NUM); - return NULL; - } - - pcal9555a = &pcal9555a_pdata[nr++]; - pcal9555a->gpio_base = gpio_base; - - if (intr >= 0) { - i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; - pcal9555a->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; - } else { - i2c_info->irq = -1; - pcal9555a->irq_base = -1; - } - - strcpy(type, "pcal9555a"); - return pcal9555a; -} - -static const struct devs_id pcal9555a_1_dev_id __initconst = { - .name = "pcal9555a-1", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &pcal9555a_platform_data, -}; - -static const struct devs_id pcal9555a_2_dev_id __initconst = { - .name = "pcal9555a-2", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &pcal9555a_platform_data, -}; - -static const struct devs_id pcal9555a_3_dev_id __initconst = { - .name = "pcal9555a-3", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &pcal9555a_platform_data, -}; - -static const struct devs_id pcal9555a_4_dev_id __initconst = { - .name = "pcal9555a-4", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &pcal9555a_platform_data, -}; - -sfi_device(pcal9555a_1_dev_id); -sfi_device(pcal9555a_2_dev_id); -sfi_device(pcal9555a_3_dev_id); -sfi_device(pcal9555a_4_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c deleted file mode 100644 index 139738bbdd36..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_tc35876x.c: tc35876x platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/gpio/machine.h> -#include <asm/intel-mid.h> - -static struct gpiod_lookup_table tc35876x_gpio_table = { - .dev_id = "i2c_disp_brig", - .table = { - GPIO_LOOKUP("0000:00:0c.0", -1, "bridge-reset", GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("0000:00:0c.0", -1, "bl-en", GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("0000:00:0c.0", -1, "vadd", GPIO_ACTIVE_HIGH), - { }, - }, -}; - -/*tc35876x DSI_LVDS bridge chip and panel platform data*/ -static void *tc35876x_platform_data(void *data) -{ - struct gpiod_lookup_table *table = &tc35876x_gpio_table; - struct gpiod_lookup *lookup = table->table; - - lookup[0].chip_hwnum = get_gpio_by_name("LCMB_RXEN"); - lookup[1].chip_hwnum = get_gpio_by_name("6S6P_BL_EN"); - lookup[2].chip_hwnum = get_gpio_by_name("EN_VREG_LCD_V3P3"); - gpiod_add_lookup_table(table); - - return NULL; -} - -static const struct devs_id tc35876x_dev_id __initconst = { - .name = "i2c_disp_brig", - .type = SFI_DEV_TYPE_I2C, - .get_platform_data = &tc35876x_platform_data, -}; - -sfi_device(tc35876x_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c deleted file mode 100644 index e689d8f61059..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * platform_tca6416.c: tca6416 platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/platform_data/pca953x.h> -#include <linux/i2c.h> -#include <linux/gpio.h> -#include <asm/intel-mid.h> - -#define TCA6416_NAME "tca6416" -#define TCA6416_BASE "tca6416_base" -#define TCA6416_INTR "tca6416_int" - -static void *tca6416_platform_data(void *info) -{ - static struct pca953x_platform_data tca6416; - struct i2c_board_info *i2c_info = info; - int gpio_base, intr; - char base_pin_name[SFI_NAME_LEN + 1]; - char intr_pin_name[SFI_NAME_LEN + 1]; - - strcpy(i2c_info->type, TCA6416_NAME); - strcpy(base_pin_name, TCA6416_BASE); - strcpy(intr_pin_name, TCA6416_INTR); - - gpio_base = get_gpio_by_name(base_pin_name); - intr = get_gpio_by_name(intr_pin_name); - - if (gpio_base < 0) - return NULL; - tca6416.gpio_base = gpio_base; - if (intr >= 0) { - i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; - tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; - } else { - i2c_info->irq = -1; - tca6416.irq_base = -1; - } - return &tca6416; -} - -static const struct devs_id tca6416_dev_id __initconst = { - .name = "tca6416", - .type = SFI_DEV_TYPE_I2C, - .delay = 1, - .get_platform_data = &tca6416_platform_data, -}; - -sfi_device(tca6416_dev_id); diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 780728161f7d..f4592dc7a1c1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * intel-mid.c: Intel MID platform setup code + * Intel MID platform setup code * - * (C) Copyright 2008, 2012 Intel Corporation + * (C) Copyright 2008, 2012, 2021 Intel Corporation * Author: Jacob Pan (jacob.jun.pan@intel.com) * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> */ @@ -14,7 +14,6 @@ #include <linux/interrupt.h> #include <linux/regulator/machine.h> #include <linux/scatterlist.h> -#include <linux/sfi.h> #include <linux/irq.h> #include <linux/export.h> #include <linux/notifier.h> @@ -25,38 +24,13 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/intel-mid.h> -#include <asm/intel_mid_vrtc.h> #include <asm/io.h> #include <asm/i8259.h> #include <asm/intel_scu_ipc.h> -#include <asm/apb_timer.h> #include <asm/reboot.h> -/* - * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, - * cmdline option x86_intel_mid_timer can be used to override the configuration - * to prefer one or the other. - * at runtime, there are basically three timer configurations: - * 1. per cpu apbt clock only - * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only - * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. - * - * by default (without cmdline option), platform code first detects cpu type - * to see if we are on lincroft or penwell, then set up both lapic or apbt - * clocks accordingly. - * i.e. by default, medfield uses configuration #2, moorestown uses #1. - * config #3 is supported but not recommended on medfield. - * - * rating and feature summary: - * lapic (with C3STOP) --------- 100 - * apbt (always-on) ------------ 110 - * lapic (always-on,ARAT) ------ 150 - */ - -enum intel_mid_timer_options intel_mid_timer_options; - -enum intel_mid_cpu_type __intel_mid_cpu_chip; -EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); +#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ +#define IPCMSG_COLD_RESET 0xF1 static void intel_mid_power_off(void) { @@ -64,69 +38,32 @@ static void intel_mid_power_off(void) intel_mid_pwr_power_off(); /* Only for Tangier, the rest will ignore this command */ - intel_scu_ipc_simple_command(IPCMSG_COLD_OFF, 1); + intel_scu_ipc_dev_simple_command(NULL, IPCMSG_COLD_OFF, 1); }; static void intel_mid_reboot(void) { - intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); -} - -static void __init intel_mid_setup_bp_timer(void) -{ - apbt_time_init(); - setup_boot_APIC_clock(); + intel_scu_ipc_dev_simple_command(NULL, IPCMSG_COLD_RESET, 0); } static void __init intel_mid_time_init(void) { - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); - - switch (intel_mid_timer_options) { - case INTEL_MID_TIMER_APBT_ONLY: - break; - case INTEL_MID_TIMER_LAPIC_APBT: - /* Use apbt and local apic */ - x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - return; - default: - if (!boot_cpu_has(X86_FEATURE_ARAT)) - break; - /* Lapic only, no apbt */ - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - return; - } - - x86_init.timers.setup_percpu_clockev = apbt_time_init; + /* Lapic only, no apbt */ + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; } static void intel_mid_arch_setup(void) { - if (boot_cpu_data.x86 != 6) { - pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; - goto out; - } - switch (boot_cpu_data.x86_model) { - case 0x35: - __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW; - break; case 0x3C: case 0x4A: - __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER; x86_platform.legacy.rtc = 1; break; - case 0x27: default: - __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; break; } -out: /* * Intel MID platforms are using explicitly defined regulators. * @@ -159,14 +96,11 @@ void __init x86_intel_mid_early_setup(void) x86_init.timers.timer_init = intel_mid_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; - x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.oem.arch_setup = intel_mid_arch_setup; - x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.arch_init = intel_mid_pci_init; @@ -188,25 +122,3 @@ void __init x86_intel_mid_early_setup(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; set_bit(MP_BUS_ISA, mp_bus_not_pci); } - -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_intel_mid_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - intel_mid_timer_options = INTEL_MID_TIMER_APBT_ONLY; - else if (strcmp("lapic_and_apbt", arg) == 0) - intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT; - else { - pr_warn("X86 INTEL_MID timer option %s not recognised use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer); diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c deleted file mode 100644 index 2226da4f437a..000000000000 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * intel_mid_vrtc.c: Driver for virtual RTC device on Intel MID platform - * - * (C) Copyright 2009 Intel Corporation - * - * Note: - * VRTC is emulated by system controller firmware, the real HW - * RTC is located in the PMIC device. SCU FW shadows PMIC RTC - * in a memory mapped IO space that is visible to the host IA - * processor. - * - * This driver is based on RTC CMOS driver. - */ - -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/platform_device.h> -#include <linux/mc146818rtc.h> - -#include <asm/intel-mid.h> -#include <asm/intel_mid_vrtc.h> -#include <asm/time.h> -#include <asm/fixmap.h> - -static unsigned char __iomem *vrtc_virt_base; - -unsigned char vrtc_cmos_read(unsigned char reg) -{ - unsigned char retval; - - /* vRTC's registers range from 0x0 to 0xD */ - if (reg > 0xd || !vrtc_virt_base) - return 0xff; - - lock_cmos_prefix(reg); - retval = __raw_readb(vrtc_virt_base + (reg << 2)); - lock_cmos_suffix(reg); - return retval; -} -EXPORT_SYMBOL_GPL(vrtc_cmos_read); - -void vrtc_cmos_write(unsigned char val, unsigned char reg) -{ - if (reg > 0xd || !vrtc_virt_base) - return; - - lock_cmos_prefix(reg); - __raw_writeb(val, vrtc_virt_base + (reg << 2)); - lock_cmos_suffix(reg); -} -EXPORT_SYMBOL_GPL(vrtc_cmos_write); - -void vrtc_get_time(struct timespec64 *now) -{ - u8 sec, min, hour, mday, mon; - unsigned long flags; - u32 year; - - spin_lock_irqsave(&rtc_lock, flags); - - while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - sec = vrtc_cmos_read(RTC_SECONDS); - min = vrtc_cmos_read(RTC_MINUTES); - hour = vrtc_cmos_read(RTC_HOURS); - mday = vrtc_cmos_read(RTC_DAY_OF_MONTH); - mon = vrtc_cmos_read(RTC_MONTH); - year = vrtc_cmos_read(RTC_YEAR); - - spin_unlock_irqrestore(&rtc_lock, flags); - - /* vRTC YEAR reg contains the offset to 1972 */ - year += 1972; - - pr_info("vRTC: sec: %d min: %d hour: %d day: %d " - "mon: %d year: %d\n", sec, min, hour, mday, mon, year); - - now->tv_sec = mktime64(year, mon, mday, hour, min, sec); - now->tv_nsec = 0; -} - -int vrtc_set_mmss(const struct timespec64 *now) -{ - unsigned long flags; - struct rtc_time tm; - int year; - int retval = 0; - - rtc_time64_to_tm(now->tv_sec, &tm); - if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) { - /* - * tm.year is the number of years since 1900, and the - * vrtc need the years since 1972. - */ - year = tm.tm_year - 72; - spin_lock_irqsave(&rtc_lock, flags); - vrtc_cmos_write(year, RTC_YEAR); - vrtc_cmos_write(tm.tm_mon, RTC_MONTH); - vrtc_cmos_write(tm.tm_mday, RTC_DAY_OF_MONTH); - vrtc_cmos_write(tm.tm_hour, RTC_HOURS); - vrtc_cmos_write(tm.tm_min, RTC_MINUTES); - vrtc_cmos_write(tm.tm_sec, RTC_SECONDS); - spin_unlock_irqrestore(&rtc_lock, flags); - } else { - pr_err("%s: Invalid vRTC value: write of %llx to vRTC failed\n", - __func__, (s64)now->tv_sec); - retval = -EINVAL; - } - return retval; -} - -void __init intel_mid_rtc_init(void) -{ - unsigned long vrtc_paddr; - - sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); - - vrtc_paddr = sfi_mrtc_array[0].phys_addr; - if (!sfi_mrtc_num || !vrtc_paddr) - return; - - vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC, - vrtc_paddr); - x86_platform.get_wallclock = vrtc_get_time; - x86_platform.set_wallclock = vrtc_set_mmss; -} - -/* - * The Moorestown platform has a memory mapped virtual RTC device that emulates - * the programming interface of the RTC. - */ - -static struct resource vrtc_resources[] = { - [0] = { - .flags = IORESOURCE_MEM, - }, - [1] = { - .flags = IORESOURCE_IRQ, - } -}; - -static struct platform_device vrtc_device = { - .name = "rtc_mrst", - .id = -1, - .resource = vrtc_resources, - .num_resources = ARRAY_SIZE(vrtc_resources), -}; - -/* Register the RTC device if appropriate */ -static int __init intel_mid_device_create(void) -{ - /* No Moorestown, no device */ - if (!intel_mid_identify_cpu()) - return -ENODEV; - /* No timer, no device */ - if (!sfi_mrtc_num) - return -ENODEV; - - /* iomem resource */ - vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr; - vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr + - MRST_VRTC_MAP_SZ; - /* irq resource */ - vrtc_resources[1].start = sfi_mrtc_array[0].irq; - vrtc_resources[1].end = sfi_mrtc_array[0].irq; - - return platform_device_register(&vrtc_device); -} -device_initcall(intel_mid_device_create); diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c deleted file mode 100644 index 30bd5714a3d4..000000000000 --- a/arch/x86/platform/intel-mid/sfi.c +++ /dev/null @@ -1,543 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * intel_mid_sfi.c: Intel MID SFI initialization code - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/sfi.h> -#include <linux/spi/spi.h> -#include <linux/i2c.h> -#include <linux/skbuff.h> -#include <linux/gpio.h> -#include <linux/gpio_keys.h> -#include <linux/input.h> -#include <linux/platform_device.h> -#include <linux/irq.h> -#include <linux/export.h> -#include <linux/notifier.h> -#include <linux/mmc/core.h> -#include <linux/mmc/card.h> -#include <linux/blkdev.h> - -#include <asm/setup.h> -#include <asm/mpspec_def.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/io_apic.h> -#include <asm/intel-mid.h> -#include <asm/intel_mid_vrtc.h> -#include <asm/io.h> -#include <asm/i8259.h> -#include <asm/intel_scu_ipc.h> -#include <asm/apb_timer.h> -#include <asm/reboot.h> - -#define SFI_SIG_OEM0 "OEM0" -#define MAX_IPCDEVS 24 -#define MAX_SCU_SPI 24 -#define MAX_SCU_I2C 24 - -static struct platform_device *ipc_devs[MAX_IPCDEVS]; -static struct spi_board_info *spi_devs[MAX_SCU_SPI]; -static struct i2c_board_info *i2c_devs[MAX_SCU_I2C]; -static struct sfi_gpio_table_entry *gpio_table; -static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -static int ipc_next_dev; -static int spi_next_dev; -static int i2c_next_dev; -static int i2c_bus[MAX_SCU_I2C]; -static int gpio_num_entry; -static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; -int sfi_mrtc_num; -int sfi_mtimer_num; - -struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; -EXPORT_SYMBOL_GPL(sfi_mrtc_array); - -struct blocking_notifier_head intel_scu_notifier = - BLOCKING_NOTIFIER_INIT(intel_scu_notifier); -EXPORT_SYMBOL_GPL(intel_scu_notifier); - -#define intel_mid_sfi_get_pdata(dev, priv) \ - ((dev)->get_platform_data ? (dev)->get_platform_data(priv) : NULL) - -/* parse all the mtimer info to a static mtimer array */ -int __init sfi_parse_mtmr(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_timer_table_entry *pentry; - struct mpc_intsrc mp_irq; - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mtimer_num) { - sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_timer_table_entry); - pentry = (struct sfi_timer_table_entry *) sb->pentry; - totallen = sfi_mtimer_num * sizeof(*pentry); - memcpy(sfi_mtimer_array, pentry, totallen); - } - - pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num); - pentry = sfi_mtimer_array; - for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", - totallen, (u32)pentry->phys_addr, - pentry->freq_hz, pentry->irq); - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = MP_IRQTRIG_EDGE | MP_IRQPOL_ACTIVE_HIGH; - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); - } - - return 0; -} - -struct sfi_timer_table_entry *sfi_get_mtmr(int hint) -{ - int i; - if (hint < sfi_mtimer_num) { - if (!sfi_mtimer_usage[hint]) { - pr_debug("hint taken for timer %d irq %d\n", - hint, sfi_mtimer_array[hint].irq); - sfi_mtimer_usage[hint] = 1; - return &sfi_mtimer_array[hint]; - } - } - /* take the first timer available */ - for (i = 0; i < sfi_mtimer_num;) { - if (!sfi_mtimer_usage[i]) { - sfi_mtimer_usage[i] = 1; - return &sfi_mtimer_array[i]; - } - i++; - } - return NULL; -} - -void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) -{ - int i; - for (i = 0; i < sfi_mtimer_num;) { - if (mtmr->irq == sfi_mtimer_array[i].irq) { - sfi_mtimer_usage[i] = 0; - return; - } - i++; - } -} - -/* parse all the mrtc info to a global mrtc array */ -int __init sfi_parse_mrtc(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_rtc_table_entry *pentry; - struct mpc_intsrc mp_irq; - - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mrtc_num) { - sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_rtc_table_entry); - pentry = (struct sfi_rtc_table_entry *)sb->pentry; - totallen = sfi_mrtc_num * sizeof(*pentry); - memcpy(sfi_mrtc_array, pentry, totallen); - } - - pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num); - pentry = sfi_mrtc_array; - for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", - totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW; - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); - } - return 0; -} - - -/* - * Parsing GPIO table first, since the DEVS table will need this table - * to map the pin name to the actual pin. - */ -static int __init sfi_parse_gpio(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_gpio_table_entry *pentry; - int num, i; - - if (gpio_table) - return 0; - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); - pentry = (struct sfi_gpio_table_entry *)sb->pentry; - - gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL); - if (!gpio_table) - return -1; - gpio_num_entry = num; - - pr_debug("GPIO pin info:\n"); - for (i = 0; i < num; i++, pentry++) - pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s," - " pin = %d\n", i, - pentry->controller_name, - pentry->pin_name, - pentry->pin_no); - return 0; -} - -int get_gpio_by_name(const char *name) -{ - struct sfi_gpio_table_entry *pentry = gpio_table; - int i; - - if (!pentry) - return -1; - for (i = 0; i < gpio_num_entry; i++, pentry++) { - if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) - return pentry->pin_no; - } - return -EINVAL; -} - -static void __init intel_scu_ipc_device_register(struct platform_device *pdev) -{ - if (ipc_next_dev == MAX_IPCDEVS) - pr_err("too many SCU IPC devices"); - else - ipc_devs[ipc_next_dev++] = pdev; -} - -static void __init intel_scu_spi_device_register(struct spi_board_info *sdev) -{ - struct spi_board_info *new_dev; - - if (spi_next_dev == MAX_SCU_SPI) { - pr_err("too many SCU SPI devices"); - return; - } - - new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL); - if (!new_dev) { - pr_err("failed to alloc mem for delayed spi dev %s\n", - sdev->modalias); - return; - } - *new_dev = *sdev; - - spi_devs[spi_next_dev++] = new_dev; -} - -static void __init intel_scu_i2c_device_register(int bus, - struct i2c_board_info *idev) -{ - struct i2c_board_info *new_dev; - - if (i2c_next_dev == MAX_SCU_I2C) { - pr_err("too many SCU I2C devices"); - return; - } - - new_dev = kzalloc(sizeof(*idev), GFP_KERNEL); - if (!new_dev) { - pr_err("failed to alloc mem for delayed i2c dev %s\n", - idev->type); - return; - } - *new_dev = *idev; - - i2c_bus[i2c_next_dev] = bus; - i2c_devs[i2c_next_dev++] = new_dev; -} - -/* Called by IPC driver */ -void intel_scu_devices_create(void) -{ - int i; - - for (i = 0; i < ipc_next_dev; i++) - platform_device_add(ipc_devs[i]); - - for (i = 0; i < spi_next_dev; i++) - spi_register_board_info(spi_devs[i], 1); - - for (i = 0; i < i2c_next_dev; i++) { - struct i2c_adapter *adapter; - struct i2c_client *client; - - adapter = i2c_get_adapter(i2c_bus[i]); - if (adapter) { - client = i2c_new_client_device(adapter, i2c_devs[i]); - if (IS_ERR(client)) - pr_err("can't create i2c device %s\n", - i2c_devs[i]->type); - } else - i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); - } - intel_scu_notifier_post(SCU_AVAILABLE, NULL); -} -EXPORT_SYMBOL_GPL(intel_scu_devices_create); - -/* Called by IPC driver */ -void intel_scu_devices_destroy(void) -{ - int i; - - intel_scu_notifier_post(SCU_DOWN, NULL); - - for (i = 0; i < ipc_next_dev; i++) - platform_device_del(ipc_devs[i]); -} -EXPORT_SYMBOL_GPL(intel_scu_devices_destroy); - -static void __init install_irq_resource(struct platform_device *pdev, int irq) -{ - /* Single threaded */ - static struct resource res __initdata = { - .name = "IRQ", - .flags = IORESOURCE_IRQ, - }; - res.start = irq; - platform_device_add_resources(pdev, &res, 1); -} - -static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, - struct devs_id *dev) -{ - struct platform_device *pdev; - void *pdata = NULL; - - pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", - pentry->name, pentry->irq); - - /* - * We need to call platform init of IPC devices to fill misc_pdata - * structure. It will be used in msic_init for initialization. - */ - pdata = intel_mid_sfi_get_pdata(dev, pentry); - if (IS_ERR(pdata)) - return; - - /* - * On Medfield the platform device creation is handled by the MSIC - * MFD driver so we don't need to do it here. - */ - if (dev->msic && intel_mid_has_msic()) - return; - - pdev = platform_device_alloc(pentry->name, 0); - if (pdev == NULL) { - pr_err("out of memory for SFI platform device '%s'.\n", - pentry->name); - return; - } - install_irq_resource(pdev, pentry->irq); - - pdev->dev.platform_data = pdata; - if (dev->delay) - intel_scu_ipc_device_register(pdev); - else - platform_device_add(pdev); -} - -static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry, - struct devs_id *dev) -{ - struct spi_board_info spi_info; - void *pdata = NULL; - - memset(&spi_info, 0, sizeof(spi_info)); - strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); - spi_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq); - spi_info.bus_num = pentry->host_num; - spi_info.chip_select = pentry->addr; - spi_info.max_speed_hz = pentry->max_freq; - pr_debug("SPI bus=%d, name=%16.16s, irq=0x%2x, max_freq=%d, cs=%d\n", - spi_info.bus_num, - spi_info.modalias, - spi_info.irq, - spi_info.max_speed_hz, - spi_info.chip_select); - - pdata = intel_mid_sfi_get_pdata(dev, &spi_info); - if (IS_ERR(pdata)) - return; - - spi_info.platform_data = pdata; - if (dev->delay) - intel_scu_spi_device_register(&spi_info); - else - spi_register_board_info(&spi_info, 1); -} - -static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry, - struct devs_id *dev) -{ - struct i2c_board_info i2c_info; - void *pdata = NULL; - - memset(&i2c_info, 0, sizeof(i2c_info)); - strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); - i2c_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq); - i2c_info.addr = pentry->addr; - pr_debug("I2C bus = %d, name = %16.16s, irq = 0x%2x, addr = 0x%x\n", - pentry->host_num, - i2c_info.type, - i2c_info.irq, - i2c_info.addr); - pdata = intel_mid_sfi_get_pdata(dev, &i2c_info); - i2c_info.platform_data = pdata; - if (IS_ERR(pdata)) - return; - - if (dev->delay) - intel_scu_i2c_device_register(pentry->host_num, &i2c_info); - else - i2c_register_board_info(pentry->host_num, &i2c_info, 1); -} - -static void __init sfi_handle_sd_dev(struct sfi_device_table_entry *pentry, - struct devs_id *dev) -{ - struct mid_sd_board_info sd_info; - void *pdata; - - memset(&sd_info, 0, sizeof(sd_info)); - strncpy(sd_info.name, pentry->name, SFI_NAME_LEN); - sd_info.bus_num = pentry->host_num; - sd_info.max_clk = pentry->max_freq; - sd_info.addr = pentry->addr; - pr_debug("SD bus = %d, name = %16.16s, max_clk = %d, addr = 0x%x\n", - sd_info.bus_num, - sd_info.name, - sd_info.max_clk, - sd_info.addr); - pdata = intel_mid_sfi_get_pdata(dev, &sd_info); - if (IS_ERR(pdata)) - return; - - /* Nothing we can do with this for now */ - sd_info.platform_data = pdata; - - pr_debug("Successfully registered %16.16s", sd_info.name); -} - -extern struct devs_id *const __x86_intel_mid_dev_start[], - *const __x86_intel_mid_dev_end[]; - -static struct devs_id __init *get_device_id(u8 type, char *name) -{ - struct devs_id *const *dev_table; - - for (dev_table = __x86_intel_mid_dev_start; - dev_table < __x86_intel_mid_dev_end; dev_table++) { - struct devs_id *dev = *dev_table; - if (dev->type == type && - !strncmp(dev->name, name, SFI_NAME_LEN)) { - return dev; - } - } - - return NULL; -} - -static int __init sfi_parse_devs(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_device_table_entry *pentry; - struct devs_id *dev = NULL; - int num, i, ret; - int polarity; - struct irq_alloc_info info; - - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); - pentry = (struct sfi_device_table_entry *)sb->pentry; - - for (i = 0; i < num; i++, pentry++) { - int irq = pentry->irq; - - if (irq != (u8)0xff) { /* native RTE case */ - /* these SPI2 devices are not exposed to system as PCI - * devices, but they have separate RTE entry in IOAPIC - * so we have to enable them one by one here - */ - if (intel_mid_identify_cpu() == - INTEL_MID_CPU_CHIP_TANGIER) { - if (!strncmp(pentry->name, "r69001-ts-i2c", 13)) - /* active low */ - polarity = 1; - else if (!strncmp(pentry->name, - "synaptics_3202", 14)) - /* active low */ - polarity = 1; - else if (irq == 41) - /* fast_int_1 */ - polarity = 1; - else - /* active high */ - polarity = 0; - } else { - /* PNW and CLV go with active low */ - polarity = 1; - } - - ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info); - WARN_ON(ret < 0); - } - - dev = get_device_id(pentry->type, pentry->name); - - if (!dev) - continue; - - switch (pentry->type) { - case SFI_DEV_TYPE_IPC: - sfi_handle_ipc_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SPI: - sfi_handle_spi_dev(pentry, dev); - break; - case SFI_DEV_TYPE_I2C: - sfi_handle_i2c_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SD: - sfi_handle_sd_dev(pentry, dev); - break; - case SFI_DEV_TYPE_UART: - case SFI_DEV_TYPE_HSI: - default: - break; - } - } - return 0; -} - -static int __init intel_mid_platform_init(void) -{ - sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); - sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); - return 0; -} -arch_initcall(intel_mid_platform_init); diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 1ac8578258af..b42bfdab01a9 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -27,7 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>"); MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); -MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); static bool force; diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 43b4d864817e..d2ccadc247e6 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -16,6 +16,7 @@ #include <asm/boot.h> #include <asm/processor-flags.h> #include <asm/msr.h> +#include <asm/nospec-branch.h> #include <xen/interface/elfnote.h> __HEAD @@ -105,6 +106,7 @@ SYM_CODE_START_LOCAL(pvh_start_xen) /* startup_64 expects boot_params in %rsi. */ mov $_pa(pvh_bootparams), %rsi mov $_pa(startup_64), %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax #else /* CONFIG_X86_64 */ diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile deleted file mode 100644 index 4eba24c2af67..000000000000 --- a/arch/x86/platform/sfi/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_SFI) += sfi.o diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c deleted file mode 100644 index 6259563760f9..000000000000 --- a/arch/x86/platform/sfi/sfi.c +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sfi.c - x86 architecture SFI support. - * - * Copyright (c) 2009, Intel Corporation. - */ - -#define KMSG_COMPONENT "SFI" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/io.h> - -#include <asm/irqdomain.h> -#include <asm/io_apic.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/apic.h> - -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; - -/* All CPUs enumerated by SFI must be present and enabled */ -static void __init mp_sfi_register_lapic(u8 id) -{ - if (MAX_LOCAL_APIC - id <= 0) { - pr_warn("Processor #%d invalid (max %d)\n", id, MAX_LOCAL_APIC); - return; - } - - pr_info("registering lapic[%d]\n", id); - - generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); -} - -static int __init sfi_parse_cpus(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_cpu_table_entry *pentry; - int i; - int cpu_num; - - sb = (struct sfi_table_simple *)table; - cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); - pentry = (struct sfi_cpu_table_entry *)sb->pentry; - - for (i = 0; i < cpu_num; i++) { - mp_sfi_register_lapic(pentry->apic_id); - pentry++; - } - - smp_found_config = 1; - return 0; -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -#ifdef CONFIG_X86_IO_APIC - -static int __init sfi_parse_ioapic(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_apic_table_entry *pentry; - int i, num; - struct ioapic_domain_cfg cfg = { - .type = IOAPIC_DOMAIN_STRICT, - .ops = &mp_ioapic_irqdomain_ops, - }; - - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); - pentry = (struct sfi_apic_table_entry *)sb->pentry; - - for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg); - pentry++; - } - - WARN(pic_mode, KERN_WARNING - "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); - pic_mode = 0; - return 0; -} -#endif /* CONFIG_X86_IO_APIC */ - -/* - * sfi_platform_init(): register lapics & io-apics - */ -int __init sfi_platform_init(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - register_lapic_address(sfi_lapic_addr); - sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); -#endif -#ifdef CONFIG_X86_IO_APIC - sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); -#endif - return 0; -} diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile index 224ff0504890..1441dda8edf7 100644 --- a/arch/x86/platform/uv/Makefile +++ b/arch/x86/platform/uv/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o +obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_time.o uv_nmi.o diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 54511eaccf4d..bf31af3d32d6 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -72,6 +72,7 @@ static s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); long system_serial_number; @@ -171,6 +172,60 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) (u64)decode, (u64)domain, (u64)bus, 0, 0); } +extern s64 uv_bios_get_master_nasid(u64 size, u64 *master_nasid) +{ + return uv_bios_call(UV_BIOS_EXTRA, 0, UV_BIOS_EXTRA_MASTER_NASID, 0, + size, (u64)master_nasid); +} +EXPORT_SYMBOL_GPL(uv_bios_get_master_nasid); + +extern s64 uv_bios_get_heapsize(u64 nasid, u64 size, u64 *heap_size) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_GET_HEAPSIZE, + 0, size, (u64)heap_size); +} +EXPORT_SYMBOL_GPL(uv_bios_get_heapsize); + +extern s64 uv_bios_install_heap(u64 nasid, u64 heap_size, u64 *bios_heap) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_INSTALL_HEAP, + 0, heap_size, (u64)bios_heap); +} +EXPORT_SYMBOL_GPL(uv_bios_install_heap); + +extern s64 uv_bios_obj_count(u64 nasid, u64 size, u64 *objcnt) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_OBJECT_COUNT, + 0, size, (u64)objcnt); +} +EXPORT_SYMBOL_GPL(uv_bios_obj_count); + +extern s64 uv_bios_enum_objs(u64 nasid, u64 size, u64 *objbuf) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_ENUM_OBJECTS, + 0, size, (u64)objbuf); +} +EXPORT_SYMBOL_GPL(uv_bios_enum_objs); + +extern s64 uv_bios_enum_ports(u64 nasid, u64 obj_id, u64 size, u64 *portbuf) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_ENUM_PORTS, + obj_id, size, (u64)portbuf); +} +EXPORT_SYMBOL_GPL(uv_bios_enum_ports); + +extern s64 uv_bios_get_geoinfo(u64 nasid, u64 size, u64 *buf) +{ + return uv_bios_call(UV_BIOS_GET_GEOINFO, nasid, (u64)buf, size, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_get_geoinfo); + +extern s64 uv_bios_get_pci_topology(u64 size, u64 *buf) +{ + return uv_bios_call(UV_BIOS_GET_PCI_TOPOLOGY, (u64)buf, size, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_get_pci_topology); + unsigned long get_uv_systab_phys(bool msg) { if ((uv_systab_phys == EFI_INVALID_TABLE_ADDR) || diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 18ca2261cc9a..1a536a187d74 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -35,8 +35,8 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; + entry->delivery_mode = apic->delivery_mode; + entry->dest_mode = apic->dest_mode_logical; entry->polarity = 0; entry->trigger = 0; entry->mask = 0; diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c deleted file mode 100644 index 266773e2fb37..000000000000 --- a/arch/x86/platform/uv/uv_sysfs.c +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. - * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson - */ - -#include <linux/device.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv.h> - -struct kobject *sgi_uv_kobj; - -static ssize_t partition_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); -} - -static ssize_t coherence_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", sn_coherency_id); -} - -static struct kobj_attribute partition_id_attr = - __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); - -static struct kobj_attribute coherence_id_attr = - __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); - - -static int __init sgi_uv_sysfs_init(void) -{ - unsigned long ret; - - if (!is_uv_system()) - return -ENODEV; - - if (!sgi_uv_kobj) - sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); - if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); - return -EINVAL; - } - - ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); - return ret; - } - - ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); - return ret; - } - - return 0; -} - -device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 6907b523e856..379777572bc9 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,9 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected CFLAGS_cpu.o := -fno-stack-protector +# Clang may incorrectly inline functions with stack protector enabled into +# __restore_processor_state(): https://bugs.llvm.org/show_bug.cgi?id=47479 +CFLAGS_REMOVE_cpu.o := $(CC_FLAGS_LTO) + obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 7918b8415f13..d9bed596d849 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -21,6 +21,53 @@ #include <asm/asm-offsets.h> #include <asm/processor-flags.h> #include <asm/frame.h> +#include <asm/nospec-branch.h> + + /* code below belongs to the image kernel */ + .align PAGE_SIZE +SYM_FUNC_START(restore_registers) + /* go back to the original page tables */ + movq %r9, %cr3 + + /* Flush TLB, including "global" things (vmalloc) */ + movq mmu_cr4_features(%rip), %rax + movq %rax, %rdx + andq $~(X86_CR4_PGE), %rdx + movq %rdx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3 + movq %rax, %cr4; # turn PGE back on + + /* We don't restore %rax, it must be 0 anyway */ + movq $saved_context, %rax + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx + movq pt_regs_r8(%rax), %r8 + movq pt_regs_r9(%rax), %r9 + movq pt_regs_r10(%rax), %r10 + movq pt_regs_r11(%rax), %r11 + movq pt_regs_r12(%rax), %r12 + movq pt_regs_r13(%rax), %r13 + movq pt_regs_r14(%rax), %r14 + movq pt_regs_r15(%rax), %r15 + pushq pt_regs_flags(%rax) + popfq + + /* Saved in save_processor_state. */ + lgdt saved_context_gdt_desc(%rax) + + xorl %eax, %eax + + /* tell the hibernation core that we've just restored the memory */ + movq %rax, in_suspend(%rip) + + ret +SYM_FUNC_END(restore_registers) SYM_FUNC_START(swsusp_arch_suspend) movq $saved_context, %rax @@ -52,7 +99,7 @@ SYM_FUNC_START(swsusp_arch_suspend) ret SYM_FUNC_END(swsusp_arch_suspend) -SYM_CODE_START(restore_image) +SYM_FUNC_START(restore_image) /* prepare to jump to the image kernel */ movq restore_jump_address(%rip), %r8 movq restore_cr3(%rip), %r9 @@ -66,11 +113,12 @@ SYM_CODE_START(restore_image) /* jump to relocated restore code */ movq relocated_restore_code(%rip), %rcx + ANNOTATE_RETPOLINE_SAFE jmpq *%rcx -SYM_CODE_END(restore_image) +SYM_FUNC_END(restore_image) /* code below has been relocated to a safe page */ -SYM_CODE_START(core_restore_code) +SYM_FUNC_START(core_restore_code) /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ @@ -97,51 +145,6 @@ SYM_CODE_START(core_restore_code) .Ldone: /* jump to the restore_registers address from the image header */ + ANNOTATE_RETPOLINE_SAFE jmpq *%r8 -SYM_CODE_END(core_restore_code) - - /* code below belongs to the image kernel */ - .align PAGE_SIZE -SYM_FUNC_START(restore_registers) - /* go back to the original page tables */ - movq %r9, %cr3 - - /* Flush TLB, including "global" things (vmalloc) */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(X86_CR4_PGE), %rdx - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3 - movq %rax, %cr4; # turn PGE back on - - /* We don't restore %rax, it must be 0 anyway */ - movq $saved_context, %rax - movq pt_regs_sp(%rax), %rsp - movq pt_regs_bp(%rax), %rbp - movq pt_regs_si(%rax), %rsi - movq pt_regs_di(%rax), %rdi - movq pt_regs_bx(%rax), %rbx - movq pt_regs_cx(%rax), %rcx - movq pt_regs_dx(%rax), %rdx - movq pt_regs_r8(%rax), %r8 - movq pt_regs_r9(%rax), %r9 - movq pt_regs_r10(%rax), %r10 - movq pt_regs_r11(%rax), %r11 - movq pt_regs_r12(%rax), %r12 - movq pt_regs_r13(%rax), %r13 - movq pt_regs_r14(%rax), %r14 - movq pt_regs_r15(%rax), %r15 - pushq pt_regs_flags(%rax) - popfq - - /* Saved in save_processor_state. */ - lgdt saved_context_gdt_desc(%rax) - - xorl %eax, %eax - - /* tell the hibernation core that we've just restored the memory */ - movq %rax, in_suspend(%rip) - - ret -SYM_FUNC_END(restore_registers) +SYM_FUNC_END(core_restore_code) diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 7b37a412f829..f03b64d9cb51 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -9,7 +9,7 @@ */ #include <linux/bug.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <asm/purgatory.h> #include "../boot/string.h" diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 55b1ab378974..bddfc9a46645 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -29,14 +29,14 @@ posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity hostprogs += insn_decoder_test insn_sanity # -I needed for generated C source and C source which in the kernel tree. -HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/ +HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(srctree)/tools/arch/x86/lib/ -I$(srctree)/tools/arch/x86/include/ -I$(objtree)/arch/x86/lib/ -HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ +HOSTCFLAGS_insn_sanity.o := -Wall -I$(srctree)/tools/arch/x86/lib/ -I$(srctree)/tools/arch/x86/include/ -I$(objtree)/arch/x86/lib/ # Dependencies are also needed. -$(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c +$(obj)/insn_decoder_test.o: $(srctree)/tools/arch/x86/lib/insn.c $(srctree)/tools/arch/x86/lib/inat.c $(srctree)/tools/arch/x86/include/asm/inat_types.h $(srctree)/tools/arch/x86/include/asm/inat.h $(srctree)/tools/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c -$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c +$(obj)/insn_sanity.o: $(srctree)/tools/arch/x86/lib/insn.c $(srctree)/tools/arch/x86/lib/inat.c $(srctree)/tools/arch/x86/include/asm/inat_types.h $(srctree)/tools/arch/x86/include/asm/inat.h $(srctree)/tools/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs += relocs diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 185ceba9d289..c6a0000ae635 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -14,10 +14,6 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> - -#define unlikely(cond) (cond) -#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) - #include <asm/insn.h> #include <inat.c> #include <insn.c> diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index ce7188cbdae5..04c5a44b9682 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -61,8 +61,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" "__(start|end)_pci_.*|" "__(start|end)_builtin_fw|" - "__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|" - "__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|" + "__(start|stop)___ksymtab(|_gpl)|" + "__(start|stop)___kcrctab(|_gpl)|" "__(start|stop)___param|" "__(start|stop)___modver|" "__(start|stop)___bug_table|" @@ -867,9 +867,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, case R_386_PC32: case R_386_PC16: case R_386_PC8: + case R_386_PLT32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * NONE can be ignored and PC relative relocations don't need + * to be adjusted. Because sym must be defined, R_386_PLT32 can + * be treated the same way as R_386_PC32. */ break; @@ -910,9 +912,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, case R_386_PC32: case R_386_PC16: case R_386_PC8: + case R_386_PLT32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * NONE can be ignored and PC relative relocations don't need + * to be adjusted. Because sym must be defined, R_386_PLT32 can + * be treated the same way as R_386_PC32. */ break; diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c index e62174638f00..1dc9adc20b1c 100644 --- a/arch/x86/um/os-Linux/task_size.c +++ b/arch/x86/um/os-Linux/task_size.c @@ -145,7 +145,7 @@ out: unsigned long os_get_top_address(void) { /* The old value of CONFIG_TOP_ADDR */ - return 0x7fc0000000; + return 0x7fc0002000; } #endif diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 51fd256c75f0..c3891c1ada26 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -7,8 +7,8 @@ #define __SYSDEP_STUB_H #include <asm/ptrace.h> +#include <generated/asm-offsets.h> -#define STUB_SYSCALL_RET EAX #define STUB_MMAP_NR __NR_mmap2 #define MMAP_OFFSET(o) ((o) >> UM_KERN_PAGE_SHIFT) @@ -77,17 +77,28 @@ static inline void trap_myself(void) __asm("int3"); } -static inline void remap_stack(int fd, unsigned long offset) +static void inline remap_stack_and_trap(void) { - __asm__ volatile ("movl %%eax,%%ebp ; movl %0,%%eax ; int $0x80 ;" - "movl %7, %%ebx ; movl %%eax, (%%ebx)" - : : "g" (STUB_MMAP_NR), "b" (STUB_DATA), - "c" (UM_KERN_PAGE_SIZE), - "d" (PROT_READ | PROT_WRITE), - "S" (MAP_FIXED | MAP_SHARED), "D" (fd), - "a" (offset), - "i" (&((struct stub_data *) STUB_DATA)->err) - : "memory"); + __asm__ volatile ( + "movl %%esp,%%ebx ;" + "andl %0,%%ebx ;" + "movl %1,%%eax ;" + "movl %%ebx,%%edi ; addl %2,%%edi ; movl (%%edi),%%edi ;" + "movl %%ebx,%%ebp ; addl %3,%%ebp ; movl (%%ebp),%%ebp ;" + "int $0x80 ;" + "addl %4,%%ebx ; movl %%eax, (%%ebx) ;" + "int $3" + : : + "g" (~(UM_KERN_PAGE_SIZE - 1)), + "g" (STUB_MMAP_NR), + "g" (UML_STUB_FIELD_FD), + "g" (UML_STUB_FIELD_OFFSET), + "g" (UML_STUB_FIELD_CHILD_ERR), + "c" (UM_KERN_PAGE_SIZE), + "d" (PROT_READ | PROT_WRITE), + "S" (MAP_FIXED | MAP_SHARED) + : + "memory"); } #endif diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 994df93c5ed3..6e2626b77a2e 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -7,8 +7,8 @@ #define __SYSDEP_STUB_H #include <sysdep/ptrace_user.h> +#include <generated/asm-offsets.h> -#define STUB_SYSCALL_RET PT_INDEX(RAX) #define STUB_MMAP_NR __NR_mmap #define MMAP_OFFSET(o) (o) @@ -82,18 +82,30 @@ static inline void trap_myself(void) __asm("int3"); } -static inline void remap_stack(long fd, unsigned long offset) +static inline void remap_stack_and_trap(void) { - __asm__ volatile ("movq %4,%%r10 ; movq %5,%%r8 ; " - "movq %6, %%r9; " __syscall "; movq %7, %%rbx ; " - "movq %%rax, (%%rbx)": - : "a" (STUB_MMAP_NR), "D" (STUB_DATA), - "S" (UM_KERN_PAGE_SIZE), - "d" (PROT_READ | PROT_WRITE), - "g" (MAP_FIXED | MAP_SHARED), "g" (fd), - "g" (offset), - "i" (&((struct stub_data *) STUB_DATA)->err) - : __syscall_clobber, "r10", "r8", "r9" ); + __asm__ volatile ( + "movq %0,%%rax ;" + "movq %%rsp,%%rdi ;" + "andq %1,%%rdi ;" + "movq %2,%%r10 ;" + "movq %%rdi,%%r8 ; addq %3,%%r8 ; movq (%%r8),%%r8 ;" + "movq %%rdi,%%r9 ; addq %4,%%r9 ; movq (%%r9),%%r9 ;" + __syscall ";" + "movq %%rsp,%%rdi ; andq %1,%%rdi ;" + "addq %5,%%rdi ; movq %%rax, (%%rdi) ;" + "int3" + : : + "g" (STUB_MMAP_NR), + "g" (~(UM_KERN_PAGE_SIZE - 1)), + "g" (MAP_FIXED | MAP_SHARED), + "g" (UML_STUB_FIELD_FD), + "g" (UML_STUB_FIELD_OFFSET), + "g" (UML_STUB_FIELD_CHILD_ERR), + "S" (UM_KERN_PAGE_SIZE), + "d" (PROT_READ | PROT_WRITE) + : + __syscall_clobber, "r10", "r8", "r9"); } #endif diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S index a193e88536a9..8291899e6aaf 100644 --- a/arch/x86/um/stub_32.S +++ b/arch/x86/um/stub_32.S @@ -5,21 +5,22 @@ .globl batch_syscall_stub batch_syscall_stub: - /* load pointer to first operation */ - mov $(STUB_DATA+8), %esp - + /* %esp comes in as "top of page" */ + mov %esp, %ecx + /* %esp has pointer to first operation */ + add $8, %esp again: /* load length of additional data */ mov 0x0(%esp), %eax /* if(length == 0) : end of list */ /* write possible 0 to header */ - mov %eax, STUB_DATA+4 + mov %eax, 0x4(%ecx) cmpl $0, %eax jz done /* save current pointer */ - mov %esp, STUB_DATA+4 + mov %esp, 0x4(%ecx) /* skip additional data */ add %eax, %esp @@ -38,6 +39,10 @@ again: /* execute syscall */ int $0x80 + /* restore top of page pointer in %ecx */ + mov %esp, %ecx + andl $(~UM_KERN_PAGE_SIZE) + 1, %ecx + /* check return value */ pop %ebx cmp %ebx, %eax @@ -45,7 +50,7 @@ again: done: /* save return value */ - mov %eax, STUB_DATA + mov %eax, (%ecx) /* stop */ int3 diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S index 8a95c5b2eaf9..f3404640197a 100644 --- a/arch/x86/um/stub_64.S +++ b/arch/x86/um/stub_64.S @@ -4,9 +4,8 @@ .section .__syscall_stub, "ax" .globl batch_syscall_stub batch_syscall_stub: - mov $(STUB_DATA), %rbx - /* load pointer to first operation */ - mov %rbx, %rsp + /* %rsp has the pointer to first operation */ + mov %rsp, %rbx add $0x10, %rsp again: /* load length of additional data */ diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c index 27361cbb7ca9..21836eaf1725 100644 --- a/arch/x86/um/stub_segv.c +++ b/arch/x86/um/stub_segv.c @@ -11,10 +11,11 @@ void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig, siginfo_t *info, void *p) { + int stack; ucontext_t *uc = p; + struct faultinfo *f = (void *)(((unsigned long)&stack) & ~(UM_KERN_PAGE_SIZE - 1)); - GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA), - &uc->uc_mcontext); + GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext); trap_myself(); } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 218acbd5c7a0..afc1da68b06d 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -26,6 +26,19 @@ config XEN_PV help Support running as a Xen PV guest. +config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN_PV + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". + config XEN_PV_SMP def_bool y depends on XEN_PV && SMP @@ -39,28 +52,19 @@ config XEN_DOM0 Support running as a Xen PV Dom0 guest. config XEN_PVHVM - bool "Xen PVHVM guest support" - default y - depends on XEN && PCI && X86_LOCAL_APIC - help - Support running as a Xen PVHVM guest. + def_bool y + depends on XEN && X86_LOCAL_APIC config XEN_PVHVM_SMP def_bool y depends on XEN_PVHVM && SMP -config XEN_512GB - bool "Limit Xen pv-domain memory to 512GB" - depends on XEN_PV +config XEN_PVHVM_GUEST + bool "Xen PVHVM guest support" default y + depends on XEN_PVHVM && PCI help - Limit paravirtualized user domains to 512GB of RAM. - - The Xen tools and crash dump analysis tools might not support - pv-domains with more than 512 GB of RAM. This option controls the - default setting of the kernel to use only up to 512 GB or more. - It is always possible to change the default via specifying the - boot parameter "xen_512gb_limit". + Support running as a Xen PVHVM guest. config XEN_SAVE_RESTORE bool @@ -76,7 +80,9 @@ config XEN_DEBUG_FS Enabling this option may incur a significant performance overhead. config XEN_PVH - bool "Support for running as a Xen PVH guest" + bool "Xen PVH guest support" depends on XEN && XEN_PVHVM && ACPI select PVH def_bool n + help + Support for running as a Xen PVH guest. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index fc5c5ba4aacb..40b5779fce21 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_xen-asm.o := y ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index e82fd1910dae..0d46cc283cf5 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -148,15 +148,12 @@ static struct apic xen_pv_apic = { .apic_id_valid = xen_id_always_valid, .apic_id_registered = xen_id_always_registered, - /* .irq_delivery_mode - used in native_compose_msi_msg only */ - /* .irq_dest_mode - used in native_compose_msi_msg only */ + /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, - /* .dest_logical - default_send_IPI_ use it but we use our own. */ - .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ + .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 205a9bc981b0..7d7ffb9c826a 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -93,37 +93,22 @@ static efi_system_table_t __init *xen_efi_probe(void) /* * Determine whether we're in secure boot mode. - * - * Please keep the logic in sync with - * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot(). */ static enum efi_secureboot_mode xen_efi_get_secureboot(void) { - static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; + enum efi_secureboot_mode mode; efi_status_t status; - u8 moksbstate, secboot, setupmode; + u8 moksbstate; unsigned long size; - size = sizeof(secboot); - status = efi.get_variable(L"SecureBoot", &efi_variable_guid, - NULL, &size, &secboot); - - if (status == EFI_NOT_FOUND) - return efi_secureboot_mode_disabled; - - if (status != EFI_SUCCESS) - goto out_efi_err; - - size = sizeof(setupmode); - status = efi.get_variable(L"SetupMode", &efi_variable_guid, - NULL, &size, &setupmode); - - if (status != EFI_SUCCESS) - goto out_efi_err; - - if (secboot == 0 || setupmode == 1) - return efi_secureboot_mode_disabled; + mode = efi_get_secureboot_mode(efi.get_variable); + if (mode == efi_secureboot_mode_unknown) { + pr_err("Could not determine UEFI Secure Boot status.\n"); + return efi_secureboot_mode_unknown; + } + if (mode != efi_secureboot_mode_enabled) + return mode; /* See if a user has put the shim into insecure mode. */ size = sizeof(moksbstate); @@ -140,10 +125,6 @@ static enum efi_secureboot_mode xen_efi_get_secureboot(void) secure_boot_enabled: pr_info("UEFI Secure Boot is enabled.\n"); return efi_secureboot_mode_enabled; - - out_efi_err: - pr_err("Could not determine UEFI Secure Boot status.\n"); - return efi_secureboot_mode_unknown; } void __init xen_efi_init(struct boot_params *boot_params) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 9e87ab010c82..e68ea5f4ad1c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -164,10 +164,10 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) else per_cpu(xen_vcpu_id, cpu) = cpu; rc = xen_vcpu_setup(cpu); - if (rc) + if (rc || !xen_have_vector_callback) return rc; - if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) + if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); rc = xen_smp_intr_init(cpu); @@ -188,6 +188,8 @@ static int xen_cpu_dead_hvm(unsigned int cpu) return 0; } +static bool no_vector_callback __initdata; + static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) @@ -207,7 +209,7 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); - if (xen_feature(XENFEAT_hvm_callback_vector)) + if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); @@ -233,6 +235,13 @@ static __init int xen_parse_nopv(char *arg) } early_param("xen_nopv", xen_parse_nopv); +static __init int xen_parse_no_vector_callback(char *arg) +{ + no_vector_callback = true; + return 0; +} +early_param("xen_no_vector_callback", xen_parse_no_vector_callback); + bool __init xen_hvm_need_lapic(void) { if (xen_pv_domain()) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4409306364dc..dc0a337f985b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs); DEFINE_IDTENTRY_RAW(xenpv_exc_nmi) { - /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */ + /* On Xen PV, NMI doesn't use IST. The C part is the same as native. */ exc_nmi(regs); } +DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault) +{ + /* On Xen PV, DF doesn't use IST. The C part is the same as native. */ + exc_double_fault(regs, error_code); +} + DEFINE_IDTENTRY_RAW(xenpv_exc_debug) { /* @@ -583,6 +589,27 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_debug) exc_debug(regs); } +DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap) +{ + /* This should never happen and there is no way to handle it. */ + pr_err("Unknown trap in Xen PV mode."); + BUG(); +} + +#ifdef CONFIG_X86_MCE +DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check) +{ + /* + * There's no IST on Xen PV, but we still need to dispatch + * to the correct handler. + */ + if (user_mode(regs)) + noist_exc_machine_check(regs); + else + exc_machine_check(regs); +} +#endif + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -601,9 +628,9 @@ struct trap_array_entry { static struct trap_array_entry trap_array[] = { TRAP_ENTRY_REDIR(exc_debug, true ), - TRAP_ENTRY(exc_double_fault, true ), + TRAP_ENTRY_REDIR(exc_double_fault, true ), #ifdef CONFIG_X86_MCE - TRAP_ENTRY(exc_machine_check, true ), + TRAP_ENTRY_REDIR(exc_machine_check, true ), #endif TRAP_ENTRY_REDIR(exc_nmi, true ), TRAP_ENTRY(exc_int3, false ), @@ -631,6 +658,7 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist) { unsigned int nr; bool ist_okay = false; + bool found = false; /* * Replace trap handler addresses by Xen specific ones. @@ -645,6 +673,7 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist) if (*addr == entry->orig) { *addr = entry->xen; ist_okay = entry->ist_okay; + found = true; break; } } @@ -655,9 +684,13 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist) nr = (*addr - (void *)early_idt_handler_array[0]) / EARLY_IDT_HANDLER_SIZE; *addr = (void *)xen_early_idt_handler_array[nr]; + found = true; } - if (WARN_ON(ist != 0 && !ist_okay)) + if (!found) + *addr = (void *)xen_asm_exc_xen_unknown_trap; + + if (WARN_ON(found && ist != 0 && !ist_okay)) return false; return true; @@ -1002,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void) */ if (xen_have_vcpu_info_placement) { pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_ops.irq.restore_fl = - __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); pv_ops.irq.irq_enable = @@ -1040,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_pmc = xen_read_pmc, .iret = xen_iret, - .usergs_sysret64 = xen_sysret64, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, @@ -1065,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { #endif .io_delay = xen_io_delay, - /* Xen takes care of %gs when switching to usermode for us */ - .swapgs = paravirt_nop, - .start_context_switch = paravirt_start_context_switch, .end_context_switch = xen_end_context_switch, }; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 850c93f346c7..dfa091d79c2e 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); -__visible void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* See xen_irq_enable() for why preemption must be disabled. */ - preempt_disable(); - vcpu = this_cpu_read(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - - if (flags == 0) { - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - xen_force_evtchn_callback(); - preempt_enable(); - } else - preempt_enable_no_resched(); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); - asmlinkage __visible void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to @@ -118,7 +96,6 @@ static void xen_halt(void) static const struct pv_irq_ops xen_irq_ops __initconst = { .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index be4151f42611..ac06ca32e9ef 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT -#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT +#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT #else #define P2M_LIMIT 0 #endif @@ -652,10 +652,9 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) pte_t *ptep; unsigned int level; - if (unlikely(pfn >= xen_p2m_size)) { - BUG_ON(mfn != INVALID_P2M_ENTRY); - return true; - } + /* Only invalid entries allowed above the highest p2m covered frame. */ + if (unlikely(pfn >= xen_p2m_size)) + return mfn == INVALID_P2M_ENTRY; /* * The interface requires atomic updates on p2m elements. @@ -710,9 +709,12 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, for (i = 0; i < count; i++) { unsigned long mfn, pfn; + struct gnttab_unmap_grant_ref unmap[2]; + int rc; /* Do not add to override if the map failed. */ - if (map_ops[i].status) + if (map_ops[i].status != GNTST_okay || + (kmap_ops && kmap_ops[i].status != GNTST_okay)) continue; if (map_ops[i].flags & GNTMAP_contains_pte) { @@ -726,16 +728,51 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned"); - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { - ret = -ENOMEM; - goto out; + if (likely(set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) + continue; + + /* + * Signal an error for this slot. This in turn requires + * immediate unmapping. + */ + map_ops[i].status = GNTST_general_error; + unmap[0].host_addr = map_ops[i].host_addr, + unmap[0].handle = map_ops[i].handle; + map_ops[i].handle = INVALID_GRANT_HANDLE; + if (map_ops[i].flags & GNTMAP_device_map) + unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr; + else + unmap[0].dev_bus_addr = 0; + + if (kmap_ops) { + kmap_ops[i].status = GNTST_general_error; + unmap[1].host_addr = kmap_ops[i].host_addr, + unmap[1].handle = kmap_ops[i].handle; + kmap_ops[i].handle = INVALID_GRANT_HANDLE; + if (kmap_ops[i].flags & GNTMAP_device_map) + unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr; + else + unmap[1].dev_bus_addr = 0; } + + /* + * Pre-populate both status fields, to be recognizable in + * the log message below. + */ + unmap[0].status = 1; + unmap[1].status = 1; + + rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, 1 + !!kmap_ops); + if (rc || unmap[0].status != GNTST_okay || + unmap[1].status != GNTST_okay) + pr_err_once("gnttab unmap failed: rc=%d st0=%d st1=%d\n", + rc, unmap[0].status, unmap[1].status); } out: return ret; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -750,20 +787,17 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); unsigned long pfn = page_to_pfn(pages[i]); - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT)) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + else ret = -EINVAL; - goto out; - } - - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } if (kunmap_ops) ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, - kunmap_ops, count); -out: + kunmap_ops, count) ?: ret; + return ret; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); #ifdef CONFIG_XEN_DEBUG_FS #include <linux/debugfs.h> @@ -795,17 +829,7 @@ static int p2m_dump_show(struct seq_file *m, void *v) return 0; } -static int p2m_dump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, p2m_dump_show, NULL); -} - -static const struct file_operations p2m_dump_fops = { - .open = p2m_dump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(p2m_dump); static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7eab14d56369..8bfc10330107 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -59,13 +59,13 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; -/* +/* * The maximum amount of extra memory compared to the base size. The * main scaling factor is the size of struct page. At extreme ratios * of base:extra, all the base memory can be filled with page * structures for the extra memory, leaving no space for anything * else. - * + * * 10x seems like a reasonable balance between scaling flexibility and * leaving a practically usable system. */ @@ -791,17 +791,10 @@ char * __init xen_memory_setup(void) /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO - * factor the base size. On non-highmem systems, the base - * size is the full initial memory allocation; on highmem it - * is limited to the max size of lowmem, so that it doesn't - * get completely filled. + * factor the base size. * * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. - * - * In principle there could be a problem in lowmem systems if - * the initial memory is also very large with respect to - * lowmem, but we won't try to deal with that here. */ extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f5e7db4f82ab..6ff3c887e0b9 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -33,9 +33,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) int cpu; native_smp_prepare_cpus(max_cpus); - WARN_ON(xen_smp_intr_init(0)); - xen_init_lock_cpu(0); + if (xen_have_vector_callback) { + WARN_ON(xen_smp_intr_init(0)); + xen_init_lock_cpu(0); + } for_each_possible_cpu(cpu) { if (cpu == 0) @@ -50,9 +52,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) static void xen_hvm_cpu_die(unsigned int cpu) { if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); + if (xen_have_vector_callback) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + } } } #else @@ -64,14 +68,19 @@ static void xen_hvm_cpu_die(unsigned int cpu) void __init xen_hvm_smp_init(void) { - if (!xen_have_vector_callback) + smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_cpus_done = xen_smp_cpus_done; + smp_ops.cpu_die = xen_hvm_cpu_die; + + if (!xen_have_vector_callback) { +#ifdef CONFIG_PARAVIRT_SPINLOCKS + nopvspin = true; +#endif return; + } - smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; - smp_ops.cpu_die = xen_hvm_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; - smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; - smp_ops.smp_cpus_done = xen_smp_cpus_done; } diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1cb0e84b9161..1e626444712b 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -14,6 +14,7 @@ #include <asm/thread_info.h> #include <asm/asm.h> #include <asm/frame.h> +#include <asm/unwind_hints.h> #include <xen/interface/xen.h> @@ -72,34 +73,6 @@ SYM_FUNC_START(xen_save_fl_direct) ret SYM_FUNC_END(xen_save_fl_direct) - -/* - * In principle the caller should be passing us a value return from - * xen_save_fl_direct, but for robustness sake we test only the - * X86_EFLAGS_IF flag rather than the whole byte. After setting the - * interrupt mask state, it checks for unmasked pending events and - * enters the hypervisor to get them delivered if so. - */ -SYM_FUNC_START(xen_restore_fl_direct) - FRAME_BEGIN - testw $X86_EFLAGS_IF, %di - setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - /* - * Preempt here doesn't matter because that will deal with any - * pending interrupts. The pending check may end up being run - * on the wrong CPU, but that doesn't hurt. - */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jnz 1f - call check_events -1: - FRAME_END - ret -SYM_FUNC_END(xen_restore_fl_direct) - - /* * Force an event check by making a hypercall, but preserve regs * before making the call. @@ -146,6 +119,7 @@ SYM_FUNC_END(xen_read_cr2_direct); .macro xen_pv_trap name SYM_CODE_START(xen_\name) + UNWIND_HINT_EMPTY pop %rcx pop %r11 jmp \name @@ -161,7 +135,7 @@ xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op xen_pv_trap asm_exc_device_not_available -xen_pv_trap asm_exc_double_fault +xen_pv_trap asm_xenpv_exc_double_fault xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap asm_exc_invalid_tss xen_pv_trap asm_exc_segment_not_present @@ -172,18 +146,20 @@ xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check #ifdef CONFIG_X86_MCE -xen_pv_trap asm_exc_machine_check +xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION xen_pv_trap entry_INT80_compat #endif +xen_pv_trap asm_exc_xen_unknown_trap xen_pv_trap asm_exc_xen_hypervisor_callback __INIT SYM_CODE_START(xen_early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS + UNWIND_HINT_EMPTY pop %rcx pop %r11 jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE @@ -210,30 +186,11 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * rsp->rax } */ SYM_CODE_START(xen_iret) + UNWIND_HINT_EMPTY pushq $0 jmp hypercall_iret SYM_CODE_END(xen_iret) -SYM_CODE_START(xen_sysret64) - /* - * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back. - * - * tss.sp2 is scratch space. - */ - movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - pushq $__USER_DS - pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) - pushq %r11 - pushq $__USER_CS - pushq %rcx - - pushq $VGCF_in_syscall - jmp hypercall_iret -SYM_CODE_END(xen_sysret64) - /* * Xen handles syscall callbacks much like ordinary exceptions, which * means we have: @@ -250,7 +207,8 @@ SYM_CODE_END(xen_sysret64) */ /* Normal 64-bit system call target */ -SYM_FUNC_START(xen_syscall_target) +SYM_CODE_START(xen_syscall_target) + UNWIND_HINT_EMPTY popq %rcx popq %r11 @@ -263,12 +221,13 @@ SYM_FUNC_START(xen_syscall_target) movq $__USER_CS, 1*8(%rsp) jmp entry_SYSCALL_64_after_hwframe -SYM_FUNC_END(xen_syscall_target) +SYM_CODE_END(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ -SYM_FUNC_START(xen_syscall32_target) +SYM_CODE_START(xen_syscall32_target) + UNWIND_HINT_EMPTY popq %rcx popq %r11 @@ -281,10 +240,11 @@ SYM_FUNC_START(xen_syscall32_target) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSCALL_compat_after_hwframe -SYM_FUNC_END(xen_syscall32_target) +SYM_CODE_END(xen_syscall32_target) /* 32-bit compat sysenter target */ -SYM_FUNC_START(xen_sysenter_target) +SYM_CODE_START(xen_sysenter_target) + UNWIND_HINT_EMPTY /* * NB: Xen is polite and clears TF from EFLAGS for us. This means * that we don't need to guard against single step exceptions here. @@ -301,17 +261,18 @@ SYM_FUNC_START(xen_sysenter_target) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSENTER_compat_after_hwframe -SYM_FUNC_END(xen_sysenter_target) +SYM_CODE_END(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ -SYM_FUNC_START_ALIAS(xen_syscall32_target) -SYM_FUNC_START(xen_sysenter_target) +SYM_CODE_START(xen_syscall32_target) +SYM_CODE_START(xen_sysenter_target) + UNWIND_HINT_EMPTY lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax pushq $0 jmp hypercall_iret -SYM_FUNC_END(xen_sysenter_target) -SYM_FUNC_END_ALIAS(xen_syscall32_target) +SYM_CODE_END(xen_sysenter_target) +SYM_CODE_END(xen_syscall32_target) #endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 2d7c8f34f56c..cb6538ae2fe0 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -68,8 +68,9 @@ SYM_CODE_END(asm_cpu_bringup_and_idle) .balign PAGE_SIZE SYM_CODE_START(hypercall_page) .rept (PAGE_SIZE / 32) - UNWIND_HINT_EMPTY - .skip 32 + UNWIND_HINT_FUNC + .skip 31, 0x90 + ret .endr #define HYPERCALL(n) \ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9546c3384c75..8d7ec49a35fb 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params) __visible void xen_irq_enable_direct(void); __visible void xen_irq_disable_direct(void); __visible unsigned long xen_save_fl_direct(void); -__visible void xen_restore_fl_direct(unsigned long); __visible unsigned long xen_read_cr2(void); __visible unsigned long xen_read_cr2_direct(void); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); -__visible void xen_sysret32(void); -__visible void xen_sysret64(void); extern int xen_panic_handler_init(void); |