diff options
Diffstat (limited to 'arch/x86')
262 files changed, 6107 insertions, 3841 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1a0be022f91d..cbd5f28ea8e2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -48,6 +48,7 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ANON_INODES select ARCH_CLOCKSOURCE_DATA + select ARCH_CLOCKSOURCE_INIT select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL @@ -119,6 +120,7 @@ config X86 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU @@ -447,7 +449,6 @@ config RETPOLINE config INTEL_RDT bool "Intel Resource Director Technology support" - default n depends on X86 && CPU_SUP_INTEL select KERNFS help @@ -523,6 +524,7 @@ config X86_VSMP bool "ScaleMP vSMP" select HYPERVISOR_GUEST select PARAVIRT + select PARAVIRT_XXL depends on X86_64 && PCI depends on X86_EXTENDED_PLATFORM depends on SMP @@ -701,7 +703,6 @@ config STA2X11 select SWIOTLB select MFD_STA2X11 select GPIOLIB - default n ---help--- This adds support for boards based on the STA2X11 IO-Hub, a.k.a. "ConneXt". The chip is used in place of the standard @@ -754,6 +755,9 @@ config PARAVIRT over full virtualization. However, when run without a hypervisor the kernel is theoretically slower and slightly larger. +config PARAVIRT_XXL + bool + config PARAVIRT_DEBUG bool "paravirt-ops debugging" depends on PARAVIRT && DEBUG_KERNEL @@ -799,7 +803,6 @@ config KVM_GUEST config KVM_DEBUG_FS bool "Enable debug information for KVM Guests in debugfs" depends on KVM_GUEST && DEBUG_FS - default n ---help--- This option enables collection of various statistics for KVM guest. Statistics are displayed in debugfs filesystem. Enabling this option @@ -808,7 +811,6 @@ config KVM_DEBUG_FS config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT - default n ---help--- Select this option to enable fine granularity task steal time accounting. Time spent executing other tasks in parallel with @@ -1168,7 +1170,6 @@ source "arch/x86/events/Kconfig" config X86_LEGACY_VM86 bool "Legacy VM86 support" - default n depends on X86_32 ---help--- This option allows user programs to put the CPU into V8086 @@ -1491,6 +1492,14 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config X86_CPA_STATISTICS + bool "Enable statistic for Change Page Attribute" + depends on DEBUG_FS + ---help--- + Expose statistics about the Change Page Attribute mechanims, which + helps to determine the effectivness of preserving large and huge + page mappings when mapping protections are changed. + config ARCH_HAS_MEM_ENCRYPT def_bool y @@ -2220,7 +2229,6 @@ config HOTPLUG_CPU config BOOTPARAM_HOTPLUG_CPU0 bool "Set default setting of cpu0_hotpluggable" - default n depends on HOTPLUG_CPU ---help--- Set whether default state of cpu0_hotpluggable is on or off. @@ -2422,7 +2430,7 @@ menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER def_bool y - depends on X86_64 && HIBERNATION + depends on HIBERNATION source "kernel/power/Kconfig" @@ -2742,8 +2750,7 @@ config OLPC config OLPC_XO1_PM bool "OLPC XO-1 Power Management" - depends on OLPC && MFD_CS5535 && PM_SLEEP - select MFD_CORE + depends on OLPC && MFD_CS5535=y && PM_SLEEP ---help--- Add support for poweroff and suspend of the OLPC XO-1 laptop. @@ -2825,7 +2832,6 @@ source "drivers/pcmcia/Kconfig" config RAPIDIO tristate "RapidIO support" depends on PCI - default n help If enabled this option will include drivers and the core infrastructure code to support RapidIO interconnect devices. diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 638411f22267..6adce15268bd 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -426,6 +426,20 @@ config CPU_SUP_AMD If unsure, say N. +config CPU_SUP_HYGON + default y + bool "Support Hygon processors" if PROCESSOR_SELECT + select CPU_SUP_AMD + help + This enables detection, tunings and quirks for Hygon processors + + You need this enabled if you want your kernel to run on an + Hygon CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on an Hygon + CPU might render the kernel unbootable. + + If unsure, say N. + config CPU_SUP_CENTAUR default y bool "Support Centaur processors" if PROCESSOR_SELECT diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7d68f0c7cfb1..0723dff17e6c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -314,7 +314,6 @@ config DEBUG_NMI_SELFTEST config DEBUG_IMR_SELFTEST bool "Isolated Memory Region self test" - default n depends on INTEL_IMR ---help--- This option enables automated sanity testing of the IMR code. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8f6e7eb8ae9f..5b562e464009 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -193,7 +193,6 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) -asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1) @@ -237,6 +236,13 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all +archmacros: + $(Q)$(MAKE) $(build)=arch/x86/kernel arch/x86/kernel/macros.s + +ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s -Wa,- +export ASM_MACRO_FLAGS +KBUILD_CFLAGS += $(ASM_MACRO_FLAGS) + ### # Kernel objects diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 28764dacf018..466f66c8a7f8 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -37,6 +37,7 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 1458b1700fc7..8b4c5e001157 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -738,6 +738,7 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) struct desc_struct *desc; void *handle; efi_system_table_t *_table; + unsigned long cmdline_paddr; efi_early = c; @@ -756,6 +757,15 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) setup_boot_services32(efi_early); /* + * make_boot_params() may have been called before efi_main(), in which + * case this is the second time we parse the cmdline. This is ok, + * parsing the cmdline multiple times does not have side-effects. + */ + cmdline_paddr = ((u64)hdr->cmd_line_ptr | + ((u64)boot_params->ext_cmd_line_ptr << 32)); + efi_parse_options((char *)cmdline_paddr); + + /* * If the boot loader gave us a value for secure_boot then we use that, * otherwise we ask the BIOS. */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index d1e19f358b6e..9ed9709d9947 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -241,7 +241,7 @@ static void parse_gb_huge_pages(char *param, char *val) } -static int handle_mem_options(void) +static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); size_t len = strlen((char *)args); @@ -251,7 +251,7 @@ static int handle_mem_options(void) if (!strstr(args, "memmap=") && !strstr(args, "mem=") && !strstr(args, "hugepages")) - return 0; + return; tmp_cmdline = malloc(len + 1); if (!tmp_cmdline) @@ -269,8 +269,7 @@ static int handle_mem_options(void) /* Stop at -- */ if (!val && strcmp(param, "--") == 0) { warn("Only '--' specified in cmdline"); - free(tmp_cmdline); - return -1; + goto out; } if (!strcmp(param, "memmap")) { @@ -283,16 +282,16 @@ static int handle_mem_options(void) if (!strcmp(p, "nopentium")) continue; mem_size = memparse(p, &p); - if (mem_size == 0) { - free(tmp_cmdline); - return -EINVAL; - } + if (mem_size == 0) + goto out; + mem_limit = mem_size; } } +out: free(tmp_cmdline); - return 0; + return; } /* @@ -578,7 +577,6 @@ static void process_mem_region(struct mem_vector *entry, unsigned long image_size) { struct mem_vector region, overlap; - struct slot_area slot_area; unsigned long start_orig, end; struct mem_vector cur_entry; diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index eaa843a52907..a480356e0ed8 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -25,20 +25,6 @@ ENTRY(get_sev_encryption_bit) push %ebx push %ecx push %edx - push %edi - - /* - * RIP-relative addressing is needed to access the encryption bit - * variable. Since we are running in 32-bit mode we need this call/pop - * sequence to get the proper relative addressing. - */ - call 1f -1: popl %edi - subl $1b, %edi - - movl enc_bit(%edi), %eax - cmpl $0, %eax - jge .Lsev_exit /* Check if running under a hypervisor */ movl $1, %eax @@ -69,15 +55,12 @@ ENTRY(get_sev_encryption_bit) movl %ebx, %eax andl $0x3f, %eax /* Return the encryption bit location */ - movl %eax, enc_bit(%edi) jmp .Lsev_exit .Lno_sev: xor %eax, %eax - movl %eax, enc_bit(%edi) .Lsev_exit: - pop %edi pop %edx pop %ecx pop %ebx @@ -113,8 +96,6 @@ ENTRY(set_sev_encryption_mask) ENDPROC(set_sev_encryption_mask) .data -enc_bit: - .int 0xffffffff #ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a423bdb42686..a1d5918765f3 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -9,6 +9,7 @@ * paravirt and debugging variants are added.) */ #undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 850b8762e889..4c881c850125 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -300,7 +300,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020d # header version number (>= 0x0105) + .word 0x020e # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -558,6 +558,10 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c +acpi_rsdp_addr: .quad 0 # 64-bit physical pointer to the + # ACPI RSDP table, added with + # version 2.14 + # End of setup header ##################################################### .section ".entrytext", "ax" diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index d4e6cd4577e5..bf0e82400358 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -391,6 +391,13 @@ int main(int argc, char ** argv) die("Unable to mmap '%s': %m", argv[2]); /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ sys_size = (sz + 15 + 4) / 16; +#ifdef CONFIG_EFI_STUB + /* + * COFF requires minimum 32-byte alignment of sections, and + * adding a signature is problematic without that alignment. + */ + sys_size = (sys_size + 1) & ~1; +#endif /* Patch the setup code with the appropriate size parameters */ buf[0x1f1] = setup_sectors-1; diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 0eb9f92f3717..6c3ab05c231d 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -247,6 +247,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index e32fc1f274d8..ac9ae487cfeb 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -243,6 +243,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index acd11b3bf639..2a356b948720 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 2071c3d1ae07..dbe8bb980da1 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis128l_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index b5f2a8fd5a71..8bebda2de92f 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis256_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index 95cf857d2cbb..f40244eaf14d 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -40,7 +40,6 @@ MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); static int __init crypto_morus1280_sse2_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 615fb7bc9a32..9afaf8f8565a 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -40,7 +40,6 @@ MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); static int __init crypto_morus640_sse2_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 352e70cd33e8..708b46a54578 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -338,7 +338,7 @@ For 32-bit we have the following conventions - kernel is built with .macro CALL_enter_from_user_mode #ifdef CONFIG_CONTEXT_TRACKING #ifdef HAVE_JUMP_LABEL - STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 + STATIC_BRANCH_JMP l_yes=.Lafter_call_\@, key=context_tracking_enabled, branch=1 #endif call enter_from_user_mode .Lafter_call_\@: diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2767c625a52c..687e47f8a796 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -389,6 +389,13 @@ * that register for the time this macro runs */ + /* + * The high bits of the CS dword (__csh) are used for + * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case + * hardware didn't do this for us. + */ + andl $(0x0000ffff), PT_CS(%esp) + /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -407,12 +414,6 @@ /* Load top of task-stack into %edi */ movl TSS_entry2task_stack(%edi), %edi - /* - * Clear unused upper bits of the dword containing the word-sized CS - * slot in pt_regs in case hardware didn't clear it for us. - */ - andl $(0x0000ffff), PT_CS(%esp) - /* Special case - entry from kernel mode via entry stack */ #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS @@ -782,7 +783,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region) * will ignore all of the single-step traps generated in this range. */ -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV /* * Xen doesn't set %esp to be precisely what the normal SYSENTER * entry point expects, so fix it up before using the normal path. @@ -1240,7 +1241,7 @@ ENTRY(spurious_interrupt_bug) jmp common_exception END(spurious_interrupt_bug) -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL @@ -1321,11 +1322,13 @@ ENTRY(xen_failsafe_callback) _ASM_EXTABLE(3b, 8b) _ASM_EXTABLE(4b, 9b) ENDPROC(xen_failsafe_callback) +#endif /* CONFIG_XEN_PV */ +#ifdef CONFIG_XEN_PVHVM BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, xen_evtchn_do_upcall) +#endif -#endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 957dfb693ecc..4d7a2d9d44cf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -142,67 +142,6 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" - -/* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - -#define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - -/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ - SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA - -ENTRY(entry_SYSCALL_64_trampoline) - UNWIND_HINT_EMPTY - swapgs - - /* Stash the user RSP. */ - movq %rsp, RSP_SCRATCH - - /* Note: using %rsp as a scratch reg. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - - /* Load the top of the task stack into RSP */ - movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp - - /* Start building the simulated IRET frame. */ - pushq $__USER_DS /* pt_regs->ss */ - pushq RSP_SCRATCH /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - - /* - * x86 lacks a near absolute jump, and we can't jump to the real - * entry text with a relative jump. We could push the target - * address and then use retq, but this destroys the pipeline on - * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, - * spill RDI and restore it in a second-stage trampoline. - */ - pushq %rdi - movq $entry_SYSCALL_64_stage2, %rdi - JMP_NOSPEC %rdi -END(entry_SYSCALL_64_trampoline) - - .popsection - -ENTRY(entry_SYSCALL_64_stage2) - UNWIND_HINT_EMPTY - popq %rdi - jmp entry_SYSCALL_64_after_hwframe -END(entry_SYSCALL_64_stage2) - ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* @@ -212,21 +151,19 @@ ENTRY(entry_SYSCALL_64) */ swapgs - /* - * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it - * is not required to switch CR3. - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) + /* tss.sp2 is scratch space. */ + movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %rax /* pt_regs->orig_ax */ + pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS @@ -900,6 +837,42 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +/** + * idtentry - Generate an IDT entry stub + * @sym: Name of the generated entry point + * @do_sym: C function to be called + * @has_error_code: True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from + * kernel mode with user GSBASE and/or user CR3. + * 2 is special -- see below. + * @shift_ist: Set to an IST index if entries from kernel mode should + * decrement the IST stack so that nested entries get a + * fresh stack. (This is for #DB, which has a nasty habit + * of recursing.) + * + * idtentry generates an IDT stub that sets up a usable kernel context, + * creates struct pt_regs, and calls @do_sym. The stub has the following + * special behaviors: + * + * On an entry from user mode, the stub switches from the trampoline or + * IST stack to the normal thread stack. On an exit to user mode, the + * normal exit-to-usermode path is invoked. + * + * On an exit to kernel mode, if @paranoid == 0, we check for preemption, + * whereas we omit the preemption check if @paranoid != 0. This is purely + * because the implementation is simpler this way. The kernel only needs + * to check for asynchronous kernel preemption when IRQ handlers return. + * + * If @paranoid == 0, then the stub will handle IRET faults by pretending + * that the fault came from user mode. It will handle gs_change faults by + * pretending that the fault happened with kernel GSBASE. Since this handling + * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have + * @paranoid == 0. This special handling will do the wrong thing for + * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. + * + * @paranoid == 2 is special: the stub will never switch stacks. This is for + * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. + */ .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -1050,7 +1023,7 @@ ENTRY(do_softirq_own_stack) ret ENDPROC(do_softirq_own_stack) -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* @@ -1130,11 +1103,13 @@ ENTRY(xen_failsafe_callback) ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) +#endif /* CONFIG_XEN_PV */ +#ifdef CONFIG_XEN_PVHVM apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ xen_hvm_callback_vector xen_evtchn_do_upcall +#endif -#endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1151,7 +1126,7 @@ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 @@ -1187,6 +1162,16 @@ ENTRY(paranoid_entry) xorl %ebx, %ebx 1: + /* + * Always stash CR3 in %r14. This value will be restored, + * verbatim, at exit. Needed if paranoid_entry interrupted + * another entry that already switched to the user CR3 value + * but has not yet returned to userspace. + * + * This is also why CS (stashed in the "iret frame" by the + * hardware at entry) can not be used: this may be a return + * to kernel code, but with a user CR3 value. + */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 ret @@ -1211,11 +1196,13 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 .Lparanoid_exit_restore: jmp restore_regs_and_return_to_kernel @@ -1626,6 +1613,7 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 testl %ebx, %ebx /* swapgs needed? */ diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index fa3f439f0a92..141d415a8c80 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -68,7 +68,13 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS) + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + CFL += $(RETPOLINE_VDSO_CFLAGS) +endif +endif $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) @@ -138,7 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) +endif +endif + $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \ diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index f19856d95c60..007b3fe9d727 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -43,50 +43,26 @@ extern u8 hvclock_page notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*ts) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : + "rcx", "r11"); return ret; } -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; -} - - #else notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[clock], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) - : "memory", "edx"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) - : "memory", "edx"); + : "=a" (ret), "=m" (*ts) + : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) + : "edx"); return ret; } @@ -98,12 +74,11 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } -static notrace u64 vread_pvclock(int *mode) +static notrace u64 vread_pvclock(void) { const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; - u64 ret; - u64 last; u32 version; + u64 ret; /* * Note: The kernel and hypervisor must guarantee that cpu ID @@ -130,175 +105,112 @@ static notrace u64 vread_pvclock(int *mode) do { version = pvclock_read_begin(pvti); - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { - *mode = VCLOCK_NONE; - return 0; - } + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) + return U64_MAX; ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); - /* refer to vread_tsc() comment for rationale */ - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - return last; + return ret; } #endif #ifdef CONFIG_HYPERV_TSCPAGE -static notrace u64 vread_hvclock(int *mode) +static notrace u64 vread_hvclock(void) { const struct ms_hyperv_tsc_page *tsc_pg = (const struct ms_hyperv_tsc_page *)&hvclock_page; - u64 current_tick = hv_read_tsc_page(tsc_pg); - - if (current_tick != U64_MAX) - return current_tick; - *mode = VCLOCK_NONE; - return 0; + return hv_read_tsc_page(tsc_pg); } #endif -notrace static u64 vread_tsc(void) -{ - u64 ret = (u64)rdtsc_ordered(); - u64 last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a function of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} - -notrace static inline u64 vgetsns(int *mode) +notrace static inline u64 vgetcyc(int mode) { - u64 v; - cycles_t cycles; - - if (gtod->vclock_mode == VCLOCK_TSC) - cycles = vread_tsc(); + if (mode == VCLOCK_TSC) + return (u64)rdtsc_ordered(); #ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->vclock_mode == VCLOCK_PVCLOCK) - cycles = vread_pvclock(mode); + else if (mode == VCLOCK_PVCLOCK) + return vread_pvclock(); #endif #ifdef CONFIG_HYPERV_TSCPAGE - else if (gtod->vclock_mode == VCLOCK_HVCLOCK) - cycles = vread_hvclock(mode); + else if (mode == VCLOCK_HVCLOCK) + return vread_hvclock(); #endif - else - return 0; - v = (cycles - gtod->cycle_last) & gtod->mask; - return v * gtod->mult; + return U64_MAX; } -/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ -notrace static int __always_inline do_realtime(struct timespec *ts) +notrace static int do_hres(clockid_t clk, struct timespec *ts) { - unsigned long seq; - u64 ns; - int mode; + struct vgtod_ts *base = >od->basetime[clk]; + u64 cycles, last, sec, ns; + unsigned int seq; do { seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->wall_time_sec; - ns = gtod->wall_time_snsec; - ns += vgetsns(&mode); + cycles = vgetcyc(gtod->vclock_mode); + ns = base->nsec; + last = gtod->cycle_last; + if (unlikely((s64)cycles < 0)) + return vdso_fallback_gettime(clk, ts); + if (cycles > last) + ns += (cycles - last) * gtod->mult; ns >>= gtod->shift; + sec = base->sec; } while (unlikely(gtod_read_retry(gtod, seq))); - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); ts->tv_nsec = ns; - return mode; + return 0; } -notrace static int __always_inline do_monotonic(struct timespec *ts) +notrace static void do_coarse(clockid_t clk, struct timespec *ts) { - unsigned long seq; - u64 ns; - int mode; + struct vgtod_ts *base = >od->basetime[clk]; + unsigned int seq; do { seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->monotonic_time_sec; - ns = gtod->monotonic_time_snsec; - ns += vgetsns(&mode); - ns >>= gtod->shift; + ts->tv_sec = base->sec; + ts->tv_nsec = base->nsec; } while (unlikely(gtod_read_retry(gtod, seq))); - - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return mode; } -notrace static void do_realtime_coarse(struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->wall_time_coarse_sec; - ts->tv_nsec = gtod->wall_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} + unsigned int msk; -notrace static void do_monotonic_coarse(struct timespec *ts) -{ - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->monotonic_time_coarse_sec; - ts->tv_nsec = gtod->monotonic_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} + /* Sort out negative (CPU/FD) and invalid clocks */ + if (unlikely((unsigned int) clock >= MAX_CLOCKS)) + return vdso_fallback_gettime(clock, ts); -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) -{ - switch (clock) { - case CLOCK_REALTIME: - if (do_realtime(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_MONOTONIC: - if (do_monotonic(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_REALTIME_COARSE: - do_realtime_coarse(ts); - break; - case CLOCK_MONOTONIC_COARSE: - do_monotonic_coarse(ts); - break; - default: - goto fallback; + /* + * Convert the clockid to a bitmask and use it to check which + * clocks are handled in the VDSO directly. + */ + msk = 1U << clock; + if (likely(msk & VGTOD_HRES)) { + return do_hres(clock, ts); + } else if (msk & VGTOD_COARSE) { + do_coarse(clock, ts); + return 0; } - - return 0; -fallback: return vdso_fallback_gettime(clock, ts); } + int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { if (likely(tv != NULL)) { - if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) - return vdso_fallback_gtod(tv, tz); + struct timespec *ts = (struct timespec *) tv; + + do_hres(CLOCK_REALTIME, ts); tv->tv_usec /= 1000; } if (unlikely(tz != NULL)) { @@ -318,7 +230,7 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86 so we don't need any locks. */ - time_t result = READ_ONCE(gtod->wall_time_sec); + time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec); if (t) *t = result; diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index 8ec3d1f4ce9a..f86ab0ae1777 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -13,14 +13,8 @@ notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { - unsigned int p; + vdso_read_cpunode(cpu, node); - p = __getcpu(); - - if (cpu) - *cpu = p & VGETCPU_CPU_MASK; - if (node) - *node = p >> 12; return 0; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b8b556dbb12..3f9d43f26f63 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -332,40 +332,6 @@ static __init int vdso_setup(char *s) return 0; } __setup("vdso=", vdso_setup); -#endif - -#ifdef CONFIG_X86_64 -static void vgetcpu_cpu_init(void *arg) -{ - int cpu = smp_processor_id(); - struct desc_struct d = { }; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (static_cpu_has(X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded - * quickly in user space in vgetcpu. (12 bits for the CPU - * and 8 bits for the node) - */ - d.limit0 = cpu | ((node & 0xf) << 12); - d.limit1 = node >> 4; - d.type = 5; /* RO data, expand down, accessed */ - d.dpl = 3; /* Visible to user code */ - d.s = 1; /* Not a system segment */ - d.p = 1; /* Present */ - d.d = 1; /* 32-bit */ - - write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); -} - -static int vgetcpu_online(unsigned int cpu) -{ - return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); -} static int __init init_vdso(void) { @@ -375,9 +341,7 @@ static int __init init_vdso(void) init_vdso_image(&vdso_image_x32); #endif - /* notifier priority > KVM */ - return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, - "x86/vdso/vma:online", vgetcpu_online, NULL); + return 0; } subsys_initcall(init_vdso); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 82ed001e8909..85fd85d52ffd 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -100,20 +100,13 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) */ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { - siginfo_t info; struct thread_struct *thread = ¤t->thread; thread->error_code = 6; /* user fault, no page, write */ thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; - clear_siginfo(&info); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *)ptr; - - force_sig_info(SIGSEGV, &info, current); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr, current); return false; } else { return true; diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index e1216dd95c04..cfcdba082feb 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -31,6 +31,8 @@ void update_vsyscall(struct timekeeper *tk) { int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + struct vgtod_ts *base; + u64 nsec; /* Mark the new vclock used. */ BUILD_BUG_ON(VCLOCK_MAX >= 32); @@ -45,34 +47,37 @@ void update_vsyscall(struct timekeeper *tk) vdata->mult = tk->tkr_mono.mult; vdata->shift = tk->tkr_mono.shift; - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + base = &vdata->basetime[CLOCK_REALTIME]; + base->sec = tk->xtime_sec; + base->nsec = tk->tkr_mono.xtime_nsec; - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } + base = &vdata->basetime[CLOCK_TAI]; + base->sec = tk->xtime_sec + (s64)tk->tai_offset; + base->nsec = tk->tkr_mono.xtime_nsec; - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); + base = &vdata->basetime[CLOCK_MONOTONIC]; + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + base->sec++; + } + base->nsec = nsec; - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + base = &vdata->basetime[CLOCK_REALTIME_COARSE]; + base->sec = tk->xtime_sec; + base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; + base = &vdata->basetime[CLOCK_MONOTONIC_COARSE]; + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->wall_to_monotonic.tv_nsec; + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + base->sec++; } + base->nsec = nsec; gtod_write_end(vdata); } diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index c84584bb9402..7d2d7c801dba 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -669,6 +669,10 @@ static int __init amd_core_pmu_init(void) * We fallback to using default amd_get_event_constraints. */ break; + case 0x18: + pr_cont("Fam18h "); + /* Using default amd_get_event_constraints. */ + break; default: pr_err("core perfctr but no constraints; unknown hardware!\n"); return -ENODEV; diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 981ba5e8241b..398df6eaa109 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -36,6 +36,7 @@ static int num_counters_llc; static int num_counters_nb; +static bool l3_mask; static HLIST_HEAD(uncore_unused_list); @@ -209,6 +210,13 @@ static int amd_uncore_event_init(struct perf_event *event) hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; + /* + * SliceMask and ThreadMask need to be set for certain L3 events in + * Family 17h. For other events, the two fields do not affect the count. + */ + if (l3_mask) + hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK); + if (event->cpu < 0) return -EINVAL; @@ -507,17 +515,19 @@ static int __init amd_uncore_init(void) { int ret = -ENODEV; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return -ENODEV; if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) return -ENODEV; - if (boot_cpu_data.x86 == 0x17) { + if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) { /* - * For F17h, the Northbridge counters are repurposed as Data - * Fabric counters. Also, L3 counters are supported too. The PMUs - * are exported based on family as either L2 or L3 and NB or DF. + * For F17h or F18h, the Northbridge counters are + * repurposed as Data Fabric counters. Also, L3 + * counters are supported too. The PMUs are exported + * based on family as either L2 or L3 and NB or DF. */ num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L3; @@ -525,6 +535,7 @@ static int __init amd_uncore_init(void) amd_llc_pmu.name = "amd_l3"; format_attr_event_df.show = &event_show_df; format_attr_event_l3.show = &event_show_l3; + l3_mask = true; } else { num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L2; @@ -532,6 +543,7 @@ static int __init amd_uncore_init(void) amd_llc_pmu.name = "amd_l2"; format_attr_event_df = format_attr_event; format_attr_event_l3 = format_attr_event; + l3_mask = false; } amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; @@ -547,7 +559,9 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("AMD NB counters detected\n"); + pr_info("%s NB counters detected\n", + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? + "HYGON" : "AMD"); ret = 0; } @@ -561,7 +575,9 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("AMD LLC counters detected\n"); + pr_info("%s LLC counters detected\n", + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? + "HYGON" : "AMD"); ret = 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index dfb2f7c0d019..106911b603bd 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1033,6 +1033,27 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +/** + * x86_perf_rdpmc_index - Return PMC counter used for event + * @event: the perf_event to which the PMC counter was assigned + * + * The counter assigned to this performance event may change if interrupts + * are enabled. This counter should thus never be used while interrupts are + * enabled. Before this function is used to obtain the assigned counter the + * event should be checked for validity using, for example, + * perf_event_read_local(), within the same interrupt disabled section in + * which this counter is planned to be used. + * + * Return: The index of the performance monitoring counter assigned to + * @perf_event. + */ +int x86_perf_rdpmc_index(struct perf_event *event) +{ + lockdep_assert_irqs_disabled(); + + return event->hw.event_base_rdpmc; +} + static inline int match_prev_assignment(struct hw_perf_event *hwc, struct cpu_hw_events *cpuc, int i) @@ -1584,7 +1605,7 @@ static void __init pmu_check_apic(void) } -static struct attribute_group x86_pmu_format_group = { +static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; @@ -1631,9 +1652,9 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) struct attribute **new; int j, i; - for (j = 0; a[j]; j++) + for (j = 0; a && a[j]; j++) ; - for (i = 0; b[i]; i++) + for (i = 0; b && b[i]; i++) j++; j++; @@ -1642,9 +1663,9 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) return NULL; j = 0; - for (i = 0; a[i]; i++) + for (i = 0; a && a[i]; i++) new[j++] = a[i]; - for (i = 0; b[i]; i++) + for (i = 0; b && b[i]; i++) new[j++] = b[i]; new[j] = NULL; @@ -1715,7 +1736,7 @@ static struct attribute *events_attr[] = { NULL, }; -static struct attribute_group x86_pmu_events_group = { +static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, }; @@ -1776,6 +1797,10 @@ static int __init init_hw_perf_events(void) case X86_VENDOR_AMD: err = amd_pmu_init(); break; + case X86_VENDOR_HYGON: + err = amd_pmu_init(); + x86_pmu.name = "HYGON"; + break; default: err = -ENOTSUPP; } @@ -2230,7 +2255,7 @@ static struct attribute *x86_pmu_attrs[] = { NULL, }; -static struct attribute_group x86_pmu_attr_group = { +static struct attribute_group x86_pmu_attr_group __ro_after_init = { .attrs = x86_pmu_attrs, }; @@ -2248,7 +2273,7 @@ static struct attribute *x86_pmu_caps_attrs[] = { NULL }; -static struct attribute_group x86_pmu_caps_group = { +static struct attribute_group x86_pmu_caps_group __ro_after_init = { .name = "caps", .attrs = x86_pmu_caps_attrs, }; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 035c37481f57..0fb8659b20d8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -242,7 +242,7 @@ EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); -static struct attribute *nhm_events_attrs[] = { +static struct attribute *nhm_mem_events_attrs[] = { EVENT_PTR(mem_ld_nhm), NULL, }; @@ -278,8 +278,6 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); static struct attribute *snb_events_attrs[] = { - EVENT_PTR(mem_ld_snb), - EVENT_PTR(mem_st_snb), EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), EVENT_PTR(td_fetch_bubbles), @@ -290,6 +288,12 @@ static struct attribute *snb_events_attrs[] = { NULL, }; +static struct attribute *snb_mem_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), + NULL, +}; + static struct event_constraint intel_hsw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -1995,6 +1999,18 @@ static void intel_pmu_nhm_enable_all(int added) intel_pmu_enable_all(added); } +static void enable_counter_freeze(void) +{ + update_debugctlmsr(get_debugctlmsr() | + DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); +} + +static void disable_counter_freeze(void) +{ + update_debugctlmsr(get_debugctlmsr() & + ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); +} + static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2200,59 +2216,15 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) +static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 status; - int handled; - int pmu_enabled; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * Save the PMU state. - * It needs to be restored when leaving the handler. - */ - pmu_enabled = cpuc->enabled; - /* - * No known reason to not always do late ACK, - * but just in case do it opt-in. - */ - if (!x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - intel_bts_disable_local(); - cpuc->enabled = 0; - __intel_pmu_disable_all(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - status = intel_pmu_get_status(); - if (!status) - goto done; - - loops = 0; -again: - intel_pmu_lbr_read(); - intel_pmu_ack_status(status); - if (++loops > 100) { - static bool warned = false; - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int bit; + int handled = 0; inc_irq_stat(apic_perf_irqs); - /* * Ignore a range of extra bits in status that do not indicate * overflow by themselves. @@ -2261,7 +2233,7 @@ again: GLOBAL_STATUS_ASIF | GLOBAL_STATUS_LBRS_FROZEN); if (!status) - goto done; + return 0; /* * In case multiple PEBS events are sampled at the same time, * it is possible to have GLOBAL_STATUS bit 62 set indicating @@ -2331,6 +2303,146 @@ again: x86_pmu_stop(event, 0); } + return handled; +} + +static bool disable_counter_freezing; +static int __init intel_perf_counter_freezing_setup(char *s) +{ + disable_counter_freezing = true; + pr_info("Intel PMU Counter freezing feature disabled\n"); + return 1; +} +__setup("disable_counter_freezing", intel_perf_counter_freezing_setup); + +/* + * Simplified handler for Arch Perfmon v4: + * - We rely on counter freezing/unfreezing to enable/disable the PMU. + * This is done automatically on PMU ack. + * - Ack the PMU only after the APIC. + */ + +static int intel_pmu_handle_irq_v4(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int handled = 0; + bool bts = false; + u64 status; + int pmu_enabled = cpuc->enabled; + int loops = 0; + + /* PMU has been disabled because of counter freezing */ + cpuc->enabled = 0; + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + bts = true; + intel_bts_disable_local(); + handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); + } + status = intel_pmu_get_status(); + if (!status) + goto done; +again: + intel_pmu_lbr_read(); + if (++loops > 100) { + static bool warned; + + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } + intel_pmu_reset(); + goto done; + } + + + handled += handle_pmi_common(regs, status); +done: + /* Ack the PMI in the APIC */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + /* + * The counters start counting immediately while ack the status. + * Make it as close as possible to IRET. This avoids bogus + * freezing on Skylake CPUs. + */ + if (status) { + intel_pmu_ack_status(status); + } else { + /* + * CPU may issues two PMIs very close to each other. + * When the PMI handler services the first one, the + * GLOBAL_STATUS is already updated to reflect both. + * When it IRETs, the second PMI is immediately + * handled and it sees clear status. At the meantime, + * there may be a third PMI, because the freezing bit + * isn't set since the ack in first PMI handlers. + * Double check if there is more work to be done. + */ + status = intel_pmu_get_status(); + if (status) + goto again; + } + + if (bts) + intel_bts_enable_local(); + cpuc->enabled = pmu_enabled; + return handled; +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc; + int loops; + u64 status; + int handled; + int pmu_enabled; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + /* + * No known reason to not always do late ACK, + * but just in case do it opt-in. + */ + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_bts_disable_local(); + cpuc->enabled = 0; + __intel_pmu_disable_all(); + handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); + status = intel_pmu_get_status(); + if (!status) + goto done; + + loops = 0; +again: + intel_pmu_lbr_read(); + intel_pmu_ack_status(status); + if (++loops > 100) { + static bool warned; + + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } + intel_pmu_reset(); + goto done; + } + + handled += handle_pmi_common(regs, status); + /* * Repeat if there is more work to be done: */ @@ -3350,6 +3462,9 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); + if (x86_pmu.counter_freezing) + enable_counter_freeze(); + if (!cpuc->shared_regs) return; @@ -3421,6 +3536,9 @@ static void intel_pmu_cpu_dying(int cpu) free_excl_cntrs(cpu); fini_debug_store_on_cpu(cpu); + + if (x86_pmu.counter_freezing) + disable_counter_freeze(); } static void intel_pmu_sched_task(struct perf_event_context *ctx, @@ -3725,6 +3843,40 @@ static __init void intel_nehalem_quirk(void) } } +static bool intel_glp_counter_freezing_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown stepping */ + + switch (cpu_data(cpu).x86_stepping) { + case 1: + rev = 0x28; + break; + case 8: + rev = 0x6; + break; + } + + return (cpu_data(cpu).microcode < rev); +} + +static __init void intel_glp_counter_freezing_quirk(void) +{ + /* Check if it's already disabled */ + if (disable_counter_freezing) + return; + + /* + * If the system starts with the wrong ucode, leave the + * counter-freezing feature permanently disabled. + */ + if (intel_glp_counter_freezing_broken(raw_smp_processor_id())) { + pr_info("PMU counter freezing disabled due to CPU errata," + "please upgrade microcode\n"); + x86_pmu.counter_freezing = false; + x86_pmu.handle_irq = intel_pmu_handle_irq; + } +} + /* * enable software workaround for errata: * SNB: BJ122 @@ -3764,8 +3916,6 @@ EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); static struct attribute *hsw_events_attrs[] = { - EVENT_PTR(mem_ld_hsw), - EVENT_PTR(mem_st_hsw), EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), EVENT_PTR(td_fetch_bubbles), @@ -3776,6 +3926,12 @@ static struct attribute *hsw_events_attrs[] = { NULL }; +static struct attribute *hsw_mem_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL, +}; + static struct attribute *hsw_tsx_events_attrs[] = { EVENT_PTR(tx_start), EVENT_PTR(tx_commit), @@ -3792,13 +3948,6 @@ static struct attribute *hsw_tsx_events_attrs[] = { NULL }; -static __init struct attribute **get_hsw_events_attrs(void) -{ - return boot_cpu_has(X86_FEATURE_RTM) ? - merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) : - hsw_events_attrs; -} - static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -3875,9 +4024,32 @@ static struct attribute *intel_pmu_attrs[] = { NULL, }; +static __init struct attribute ** +get_events_attrs(struct attribute **base, + struct attribute **mem, + struct attribute **tsx) +{ + struct attribute **attrs = base; + struct attribute **old; + + if (mem && x86_pmu.pebs) + attrs = merge_attr(attrs, mem); + + if (tsx && boot_cpu_has(X86_FEATURE_RTM)) { + old = attrs; + attrs = merge_attr(attrs, tsx); + if (old != base) + kfree(old); + } + + return attrs; +} + __init int intel_pmu_init(void) { struct attribute **extra_attr = NULL; + struct attribute **mem_attr = NULL; + struct attribute **tsx_attr = NULL; struct attribute **to_free = NULL; union cpuid10_edx edx; union cpuid10_eax eax; @@ -3935,6 +4107,9 @@ __init int intel_pmu_init(void) max((int)edx.split.num_counters_fixed, assume); } + if (version >= 4) + x86_pmu.counter_freezing = !disable_counter_freezing; + if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -3986,7 +4161,7 @@ __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - x86_pmu.cpu_events = nhm_events_attrs; + mem_attr = nhm_mem_events_attrs; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4004,11 +4179,11 @@ __init int intel_pmu_init(void) name = "nehalem"; break; - case INTEL_FAM6_ATOM_PINEVIEW: - case INTEL_FAM6_ATOM_LINCROFT: - case INTEL_FAM6_ATOM_PENWELL: - case INTEL_FAM6_ATOM_CLOVERVIEW: - case INTEL_FAM6_ATOM_CEDARVIEW: + case INTEL_FAM6_ATOM_BONNELL: + case INTEL_FAM6_ATOM_BONNELL_MID: + case INTEL_FAM6_ATOM_SALTWELL: + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4021,9 +4196,11 @@ __init int intel_pmu_init(void) name = "bonnell"; break; - case INTEL_FAM6_ATOM_SILVERMONT1: - case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_SILVERMONT: + case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_SILVERMONT_MID: case INTEL_FAM6_ATOM_AIRMONT: + case INTEL_FAM6_ATOM_AIRMONT_MID: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, @@ -4042,7 +4219,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -4068,7 +4245,8 @@ __init int intel_pmu_init(void) name = "goldmont"; break; - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + x86_add_quirk(intel_glp_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -4112,7 +4290,7 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_westmere_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.cpu_events = nhm_events_attrs; + mem_attr = nhm_mem_events_attrs; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4152,6 +4330,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; + mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4192,6 +4371,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; + mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = @@ -4226,10 +4406,12 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = get_hsw_events_attrs(); + x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + mem_attr = hsw_mem_events_attrs; + tsx_attr = hsw_tsx_events_attrs; pr_cont("Haswell events, "); name = "haswell"; break; @@ -4265,10 +4447,12 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = get_hsw_events_attrs(); + x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + mem_attr = hsw_mem_events_attrs; + tsx_attr = hsw_tsx_events_attrs; pr_cont("Broadwell events, "); name = "broadwell"; break; @@ -4324,7 +4508,9 @@ __init int intel_pmu_init(void) hsw_format_attr : nhm_format_attr; extra_attr = merge_attr(extra_attr, skl_format_attr); to_free = extra_attr; - x86_pmu.cpu_events = get_hsw_events_attrs(); + x86_pmu.cpu_events = hsw_events_attrs; + mem_attr = hsw_mem_events_attrs; + tsx_attr = hsw_tsx_events_attrs; intel_pmu_pebs_data_source_skl( boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); @@ -4357,6 +4543,9 @@ __init int intel_pmu_init(void) WARN_ON(!x86_pmu.format_attrs); } + x86_pmu.cpu_events = get_events_attrs(x86_pmu.cpu_events, + mem_attr, tsx_attr); + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); @@ -4431,6 +4620,13 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } + /* + * For arch perfmon 4 use counter freezing to avoid + * several MSR accesses in the PMI. + */ + if (x86_pmu.counter_freezing) + x86_pmu.handle_irq = intel_pmu_handle_irq_v4; + kfree(to_free); return 0; } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 9f8084f18d58..d2e780705c5a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -559,8 +559,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), @@ -581,9 +581,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f3e006bed9a7..c88ed39582a1 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1272,4 +1272,8 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* Knights Landing does have MISPREDICT bit */ + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP) + x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 8d016ce5b80d..3a0aa83cbd07 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -95,7 +95,7 @@ static ssize_t pt_cap_show(struct device *cdev, return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); } -static struct attribute_group pt_cap_group = { +static struct attribute_group pt_cap_group __ro_after_init = { .name = "caps", }; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 32f3e9423e99..91039ffed633 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -777,9 +777,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 51d7c117e3c7..c07bee31abe8 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3061,7 +3061,7 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; @@ -3931,16 +3931,16 @@ static const struct pci_device_id skx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3), }, { /* M3UPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 0), }, { /* M3UPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204E), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 2, SKX_PCI_UNCORE_M3UPI, 1), }, { /* M3UPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 5, SKX_PCI_UNCORE_M3UPI, 2), }, { /* end: all zeroes */ } }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index b4771a6ddbc1..1b9f85abf9bc 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -69,14 +69,14 @@ static bool test_intel(int idx) case INTEL_FAM6_BROADWELL_GT3E: case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_ATOM_SILVERMONT1: - case INTEL_FAM6_ATOM_SILVERMONT2: + case INTEL_FAM6_ATOM_SILVERMONT: + case INTEL_FAM6_ATOM_SILVERMONT_X: case INTEL_FAM6_ATOM_AIRMONT: case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 156286335351..adae087cecdd 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -560,9 +560,11 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; - bool late_ack; u64 (*limit_period)(struct perf_event *event, u64 l); + /* PMI handler bits */ + unsigned int late_ack :1, + counter_freezing :1; /* * sysfs attrs */ diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index b21ee65c4101..1c11f9420a82 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,2 +1,6 @@ obj-y := hv_init.o mmu.o nested.o obj-$(CONFIG_X86_64) += hv_apic.o + +ifdef CONFIG_X86_64 +obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o +endif diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5b0f613428c2..8eb6fbee8e13 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -20,7 +20,6 @@ */ #include <linux/types.h> -#include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/clockchips.h> @@ -95,8 +94,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val) */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) { - struct ipi_arg_ex **arg; - struct ipi_arg_ex *ipi_arg; + struct hv_send_ipi_ex **arg; + struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; int ret = 1; @@ -105,7 +104,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) return false; local_irq_save(flags); - arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); ipi_arg = *arg; if (unlikely(!ipi_arg)) @@ -135,7 +134,7 @@ ipi_mask_ex_done: static bool __send_ipi_mask(const struct cpumask *mask, int vector) { int cur_cpu, vcpu; - struct ipi_arg_non_ex ipi_arg; + struct hv_send_ipi ipi_arg; int ret = 1; trace_hyperv_send_ipi_mask(mask, vector); diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 20c876c7c5bf..7abb09e2eeb8 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -17,6 +17,7 @@ * */ +#include <linux/efi.h> #include <linux/types.h> #include <asm/apic.h> #include <asm/desc.h> @@ -253,6 +254,22 @@ static int hv_cpu_die(unsigned int cpu) return 0; } +static int __init hv_pci_init(void) +{ + int gen2vm = efi_enabled(EFI_BOOT); + + /* + * For Generation-2 VM, we exit from pci_arch_init() by returning 0. + * The purpose is to suppress the harmless warning: + * "PCI: Fatal: No config space access function found" + */ + if (gen2vm) + return 0; + + /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ + return 1; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -329,6 +346,8 @@ void __init hyperv_init(void) hv_apic_init(); + x86_init.pci.arch_init = hv_pci_init; + /* * Register Hyper-V specific clocksource. */ diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c new file mode 100644 index 000000000000..a861b0456b1a --- /dev/null +++ b/arch/x86/hyperv/hv_spinlock.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V specific spinlock code. + * + * Copyright (C) 2018, Intel, Inc. + * + * Author : Yi Sun <yi.y.sun@intel.com> + */ + +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/spinlock.h> + +#include <asm/mshyperv.h> +#include <asm/paravirt.h> +#include <asm/apic.h> + +static bool __initdata hv_pvspin = true; + +static void hv_qlock_kick(int cpu) +{ + apic->send_IPI(cpu, X86_PLATFORM_IPI_VECTOR); +} + +static void hv_qlock_wait(u8 *byte, u8 val) +{ + unsigned long msr_val; + unsigned long flags; + + if (in_nmi()) + return; + + /* + * Reading HV_X64_MSR_GUEST_IDLE MSR tells the hypervisor that the + * vCPU can be put into 'idle' state. This 'idle' state is + * terminated by an IPI, usually from hv_qlock_kick(), even if + * interrupts are disabled on the vCPU. + * + * To prevent a race against the unlock path it is required to + * disable interrupts before accessing the HV_X64_MSR_GUEST_IDLE + * MSR. Otherwise, if the IPI from hv_qlock_kick() arrives between + * the lock value check and the rdmsrl() then the vCPU might be put + * into 'idle' state by the hypervisor and kept in that state for + * an unspecified amount of time. + */ + local_irq_save(flags); + /* + * Only issue the rdmsrl() when the lock state has not changed. + */ + if (READ_ONCE(*byte) == val) + rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val); + local_irq_restore(flags); +} + +/* + * Hyper-V does not support this so far. + */ +bool hv_vcpu_is_preempted(int vcpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted); + +void __init hv_init_spinlocks(void) +{ + if (!hv_pvspin || !apic || + !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) || + !(ms_hyperv.features & HV_X64_MSR_GUEST_IDLE_AVAILABLE)) { + pr_info("PV spinlocks disabled\n"); + return; + } + pr_info("PV spinlocks enabled\n"); + + __pv_init_lock_hash(); + pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops.lock.wait = hv_qlock_wait; + pv_ops.lock.kick = hv_qlock_kick; + pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted); +} + +static __init int hv_parse_nopvspin(char *arg) +{ + hv_pvspin = false; + return 0; +} +early_param("hv_nopvspin", hv_parse_nopvspin); diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index ef5f29f913d7..e65d7fe6489f 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -231,6 +231,6 @@ void hyperv_setup_mmu_ops(void) return; pr_info("Using hypercall for remote TLB flush\n"); - pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; - pv_mmu_ops.tlb_remove_table = tlb_remove_table; + pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index a303d7b7d763..2f01eb4d6208 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -142,6 +142,8 @@ static inline u64 acpi_arch_get_root_pointer(void) void acpi_generic_reduced_hw_init(void); +u64 x86_default_get_root_pointer(void); + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 @@ -153,6 +155,11 @@ static inline void disable_acpi(void) { } static inline void acpi_generic_reduced_hw_init(void) { } +static inline u64 x86_default_get_root_pointer(void) +{ + return 0; +} + #endif /* !CONFIG_ACPI */ #define ARCH_HAS_POWER_INIT 1 diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 31b627b43a8e..8e4ea39e55d0 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -7,16 +7,24 @@ #include <asm/asm.h> #ifdef CONFIG_SMP - .macro LOCK_PREFIX -672: lock +.macro LOCK_PREFIX_HERE .pushsection .smp_locks,"a" .balign 4 - .long 672b - . + .long 671f - . # offset .popsection - .endm +671: +.endm + +.macro LOCK_PREFIX insn:vararg + LOCK_PREFIX_HERE + lock \insn +.endm #else - .macro LOCK_PREFIX - .endm +.macro LOCK_PREFIX_HERE +.endm + +.macro LOCK_PREFIX insn:vararg +.endm #endif /* diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4cd6a3b71824..d7faa16622d8 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -31,15 +31,8 @@ */ #ifdef CONFIG_SMP -#define LOCK_PREFIX_HERE \ - ".pushsection .smp_locks,\"a\"\n" \ - ".balign 4\n" \ - ".long 671f - .\n" /* offset */ \ - ".popsection\n" \ - "671:" - -#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " - +#define LOCK_PREFIX_HERE "LOCK_PREFIX_HERE\n\t" +#define LOCK_PREFIX "LOCK_PREFIX " #else /* ! CONFIG_SMP */ #define LOCK_PREFIX_HERE "" #define LOCK_PREFIX "" diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index fddb6d26239f..1ae4e5791afa 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -103,6 +103,9 @@ static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) static inline bool amd_gart_present(void) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return false; + /* GART present only on Fam15h, upto model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 990770f9e76b..21b086786404 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -120,16 +120,32 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ - .pushsection "__ex_table","a" ; \ - .balign 4 ; \ - .long (from) - . ; \ - .long (to) - . ; \ - .long (handler) - . ; \ + ASM_EXTABLE_HANDLE from to handler + +.macro ASM_EXTABLE_HANDLE from:req to:req handler:req + .pushsection "__ex_table","a" + .balign 4 + .long (\from) - . + .long (\to) - . + .long (\handler) - . .popsection +.endm +#else /* __ASSEMBLY__ */ + +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + "ASM_EXTABLE_HANDLE from=" #from " to=" #to \ + " handler=\"" #handler "\"\n\t" + +/* For C file, we already have NOKPROBE_SYMBOL macro */ + +#endif /* __ASSEMBLY__ */ # define _ASM_EXTABLE(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) +# define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) @@ -145,6 +161,7 @@ _ASM_PTR (entry); \ .popsection +#ifdef __ASSEMBLY__ .macro ALIGN_DESTINATION /* check for bad alignment of destination */ movl %edi,%ecx @@ -165,34 +182,10 @@ jmp copy_user_handle_tail .previous - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) + _ASM_EXTABLE_UA(100b, 103b) + _ASM_EXTABLE_UA(101b, 103b) .endm - -#else -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 4\n" \ - " .long (" #from ") - .\n" \ - " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ - " .popsection\n" - -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - -/* For C file, we already have NOKPROBE_SYMBOL macro */ -#endif +#endif /* __ASSEMBLY__ */ #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b143717b92b3..ea3d95275b43 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -80,11 +80,11 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -#define arch_atomic_sub_and_test arch_atomic_sub_and_test static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } +#define arch_atomic_sub_and_test arch_atomic_sub_and_test /** * arch_atomic_inc - increment atomic variable @@ -92,12 +92,12 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ -#define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); } +#define arch_atomic_inc arch_atomic_inc /** * arch_atomic_dec - decrement atomic variable @@ -105,12 +105,12 @@ static __always_inline void arch_atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. */ -#define arch_atomic_dec arch_atomic_dec static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); } +#define arch_atomic_dec arch_atomic_dec /** * arch_atomic_dec_and_test - decrement and test @@ -120,11 +120,11 @@ static __always_inline void arch_atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -#define arch_atomic_dec_and_test arch_atomic_dec_and_test static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } +#define arch_atomic_dec_and_test arch_atomic_dec_and_test /** * arch_atomic_inc_and_test - increment and test @@ -134,11 +134,11 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -#define arch_atomic_inc_and_test arch_atomic_inc_and_test static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } +#define arch_atomic_inc_and_test arch_atomic_inc_and_test /** * arch_atomic_add_negative - add and test if negative @@ -149,11 +149,11 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -#define arch_atomic_add_negative arch_atomic_add_negative static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } +#define arch_atomic_add_negative arch_atomic_add_negative /** * arch_atomic_add_return - add integer and return diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index ef959f02d070..6a5b0ec460da 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -205,12 +205,12 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v) * * Atomically increments @v by 1. */ -#define arch_atomic64_inc arch_atomic64_inc static inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } +#define arch_atomic64_inc arch_atomic64_inc /** * arch_atomic64_dec - decrement atomic64 variable @@ -218,12 +218,12 @@ static inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ -#define arch_atomic64_dec arch_atomic64_dec static inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } +#define arch_atomic64_dec arch_atomic64_dec /** * arch_atomic64_add_unless - add unless the number is a given value @@ -245,7 +245,6 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, return (int)a; } -#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero static inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; @@ -253,8 +252,8 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v) "S" (v) : "ecx", "edx", "memory"); return r; } +#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) { long long r; @@ -262,6 +261,7 @@ static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) "S" (v) : "ecx", "memory"); return r; } +#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive #undef alternative_atomic64 #undef __alternative_atomic64 diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 4343d9b4f30e..dadc20adba21 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -71,11 +71,11 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } +#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test /** * arch_atomic64_inc - increment atomic64 variable @@ -83,13 +83,13 @@ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ -#define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) : "m" (v->counter)); } +#define arch_atomic64_inc arch_atomic64_inc /** * arch_atomic64_dec - decrement atomic64 variable @@ -97,13 +97,13 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ -#define arch_atomic64_dec arch_atomic64_dec static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) : "m" (v->counter)); } +#define arch_atomic64_dec arch_atomic64_dec /** * arch_atomic64_dec_and_test - decrement and test @@ -113,11 +113,11 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) * returns true if the result is 0, or false for all other * cases. */ -#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } +#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test /** * arch_atomic64_inc_and_test - increment and test @@ -127,11 +127,11 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ -#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } +#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test /** * arch_atomic64_add_negative - add and test if negative @@ -142,11 +142,11 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -#define arch_atomic64_add_negative arch_atomic64_add_negative static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } +#define arch_atomic64_add_negative arch_atomic64_add_negative /** * arch_atomic64_add_return - add and return diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 9f645ba57dbb..124f9195eb3e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -217,8 +217,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), - *addr, "Ir", nr, "%0", c); + return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -264,8 +263,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), - *addr, "Ir", nr, "%0", c); + return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -318,8 +316,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), - *addr, "Ir", nr, "%0", c); + return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 6804d6642767..5090035e6d16 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -4,6 +4,8 @@ #include <linux/stringify.h> +#ifndef __ASSEMBLY__ + /* * Despite that some emulators terminate on UD2, we use it for WARN(). * @@ -20,53 +22,15 @@ #define LEN_UD2 2 -#ifdef CONFIG_GENERIC_BUG - -#ifdef CONFIG_X86_32 -# define __BUG_REL(val) ".long " __stringify(val) -#else -# define __BUG_REL(val) ".long " __stringify(val) " - 2b" -#endif - -#ifdef CONFIG_DEBUG_BUGVERBOSE - -#define _BUG_FLAGS(ins, flags) \ -do { \ - asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ - "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ - "\t.word %c1" "\t# bug_entry::line\n" \ - "\t.word %c2" "\t# bug_entry::flags\n" \ - "\t.org 2b+%c3\n" \ - ".popsection" \ - : : "i" (__FILE__), "i" (__LINE__), \ - "i" (flags), \ - "i" (sizeof(struct bug_entry))); \ -} while (0) - -#else /* !CONFIG_DEBUG_BUGVERBOSE */ - #define _BUG_FLAGS(ins, flags) \ do { \ - asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ - "\t.word %c0" "\t# bug_entry::flags\n" \ - "\t.org 2b+%c1\n" \ - ".popsection" \ - : : "i" (flags), \ + asm volatile("ASM_BUG ins=\"" ins "\" file=%c0 line=%c1 " \ + "flags=%c2 size=%c3" \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) -#endif /* CONFIG_DEBUG_BUGVERBOSE */ - -#else - -#define _BUG_FLAGS(ins, flags) asm volatile(ins) - -#endif /* CONFIG_GENERIC_BUG */ - #define HAVE_ARCH_BUG #define BUG() \ do { \ @@ -82,4 +46,54 @@ do { \ #include <asm-generic/bug.h> +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_GENERIC_BUG + +#ifdef CONFIG_X86_32 +.macro __BUG_REL val:req + .long \val +.endm +#else +.macro __BUG_REL val:req + .long \val - 2b +.endm +#endif + +#ifdef CONFIG_DEBUG_BUGVERBOSE + +.macro ASM_BUG ins:req file:req line:req flags:req size:req +1: \ins + .pushsection __bug_table,"aw" +2: __BUG_REL val=1b # bug_entry::bug_addr + __BUG_REL val=\file # bug_entry::file + .word \line # bug_entry::line + .word \flags # bug_entry::flags + .org 2b+\size + .popsection +.endm + +#else /* !CONFIG_DEBUG_BUGVERBOSE */ + +.macro ASM_BUG ins:req file:req line:req flags:req size:req +1: \ins + .pushsection __bug_table,"aw" +2: __BUG_REL val=1b # bug_entry::bug_addr + .word \flags # bug_entry::flags + .org 2b+\size + .popsection +.endm + +#endif /* CONFIG_DEBUG_BUGVERBOSE */ + +#else /* CONFIG_GENERIC_BUG */ + +.macro ASM_BUG ins:req file:req line:req flags:req size:req + \ins +.endm + +#endif /* CONFIG_GENERIC_BUG */ + +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index e958e28f7ab5..86b63c7feab7 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -3,5 +3,6 @@ #define _ASM_X86_CACHEINFO_H void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index a55d79b233d3..bfb85e5844ab 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -242,10 +242,12 @@ extern void __add_wrong_size(void) BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ - asm volatile(pfx "cmpxchg%c4b %2; sete %0" \ - : "=a" (__ret), "+d" (__old2), \ - "+m" (*(p1)), "+m" (*(p2)) \ - : "i" (2 * sizeof(long)), "a" (__old1), \ + asm volatile(pfx "cmpxchg%c5b %1" \ + CC_SET(e) \ + : CC_OUT(e) (__ret), \ + "+m" (*(p1)), "+m" (*(p2)), \ + "+a" (__old1), "+d" (__old2) \ + : "i" (2 * sizeof(long)), \ "b" (__new1), "c" (__new2)); \ __ret; \ }) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index fb97cf7c4137..fab4df16a3c4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -12,38 +12,23 @@ #include <asm/user32.h> #include <asm/unistd.h> +#include <asm-generic/compat.h> + #define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" -typedef u32 compat_size_t; -typedef s32 compat_ssize_t; -typedef s32 compat_clock_t; -typedef s32 compat_pid_t; typedef u16 __compat_uid_t; typedef u16 __compat_gid_t; typedef u32 __compat_uid32_t; typedef u32 __compat_gid32_t; typedef u16 compat_mode_t; -typedef u32 compat_ino_t; typedef u16 compat_dev_t; -typedef s32 compat_off_t; -typedef s64 compat_loff_t; typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; -typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; -typedef s32 compat_timer_t; -typedef s32 compat_key_t; - -typedef s32 compat_int_t; -typedef s32 compat_long_t; typedef s64 __attribute__((aligned(4))) compat_s64; -typedef u32 compat_uint_t; -typedef u32 compat_ulong_t; -typedef u32 compat_u32; typedef u64 __attribute__((aligned(4))) compat_u64; -typedef u32 compat_uptr_t; struct compat_stat { compat_dev_t st_dev; @@ -240,6 +225,6 @@ static inline bool in_compat_syscall(void) struct compat_siginfo; int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const siginfo_t *from, bool x32_ABI); + const kernel_siginfo_t *from, bool x32_ABI); #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 4a7884b8dca5..29c706415443 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -30,8 +30,6 @@ struct cpu_entry_area { */ struct tss_struct tss; - char entry_trampoline[PAGE_SIZE]; - #ifdef CONFIG_X86_64 /* * Exception stacks used for IST entries. diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aced6c9290d6..7d442722ef24 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -2,10 +2,10 @@ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#include <asm/processor.h> - -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ +#include <asm/processor.h> #include <asm/asm.h> #include <linux/bitops.h> @@ -161,37 +161,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline __pure bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" - "2:\n" - ".skip -(((5f-4f) - (2b-1b)) > 0) * " - "((5f-4f) - (2b-1b)),0x90\n" - "3:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 4f - .\n" /* repl offset */ - " .word %P[always]\n" /* always replace */ - " .byte 3b - 1b\n" /* src len */ - " .byte 5f - 4f\n" /* repl len */ - " .byte 3b - 2b\n" /* pad len */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "4: jmp %l[t_no]\n" - "5:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 0\n" /* no replacement */ - " .word %P[feature]\n" /* feature bit */ - " .byte 3b - 1b\n" /* src len */ - " .byte 0\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .altinstr_aux,\"ax\"\n" - "6:\n" - " testb %[bitnum],%[cap_byte]\n" - " jnz %l[t_yes]\n" - " jmp %l[t_no]\n" - ".previous\n" + asm_volatile_goto("STATIC_CPU_HAS bitnum=%[bitnum] " + "cap_byte=\"%[cap_byte]\" " + "feature=%P[feature] t_yes=%l[t_yes] " + "t_no=%l[t_no] always=%P[always]" : : [feature] "i" (bit), [always] "i" (X86_FEATURE_ALWAYS), [bitnum] "i" (1 << (bit & 7)), @@ -226,5 +199,44 @@ t_no: #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model -#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ +#else /* __ASSEMBLY__ */ + +.macro STATIC_CPU_HAS bitnum:req cap_byte:req feature:req t_yes:req t_no:req always:req +1: + jmp 6f +2: + .skip -(((5f-4f) - (2b-1b)) > 0) * ((5f-4f) - (2b-1b)),0x90 +3: + .section .altinstructions,"a" + .long 1b - . /* src offset */ + .long 4f - . /* repl offset */ + .word \always /* always replace */ + .byte 3b - 1b /* src len */ + .byte 5f - 4f /* repl len */ + .byte 3b - 2b /* pad len */ + .previous + .section .altinstr_replacement,"ax" +4: + jmp \t_no +5: + .previous + .section .altinstructions,"a" + .long 1b - . /* src offset */ + .long 0 /* no replacement */ + .word \feature /* feature bit */ + .byte 3b - 1b /* src len */ + .byte 0 /* repl len */ + .byte 0 /* pad len */ + .previous + .section .altinstr_aux,"ax" +6: + testb \bitnum,\cap_byte + jnz \t_yes + jmp \t_no + .previous +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __KERNEL__ */ #endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 4505ac2735ad..9e5ca30738e5 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -8,7 +8,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7); -#ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 13c5ee878a47..68a99d2a5f33 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -108,7 +108,7 @@ static inline int desc_empty(const void *ptr) return !(desc[0] | desc[1]); } -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define load_TR_desc() native_load_tr_desc() @@ -134,7 +134,7 @@ static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { } -#endif /* CONFIG_PARAVIRT */ +#endif /* CONFIG_PARAVIRT_XXL */ #define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index cec5fae23eb3..eea40d52ca78 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -140,6 +140,7 @@ extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); +extern void efi_recover_from_page_fault(unsigned long phys_addr); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 0d157d2a1e2a..69c0f892e310 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -10,6 +10,7 @@ #include <asm/ptrace.h> #include <asm/user.h> #include <asm/auxvec.h> +#include <asm/fsgsbase.h> typedef unsigned long elf_greg_t; @@ -62,8 +63,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ #define R_X86_64_8 14 /* Direct 8 bit sign extended */ #define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ - -#define R_X86_64_NUM 16 +#define R_X86_64_PC64 24 /* Place relative 64-bit signed */ /* * These are used to set parameters in the core dumps. @@ -205,7 +205,6 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ - unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -228,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ - rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ + (pr_reg)[21] = x86_fsbase_read_cpu(); \ + (pr_reg)[22] = x86_gsbase_read_cpu_inactive(); \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index f9c3a5d502f4..d8c2198d543b 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -29,7 +29,8 @@ struct pt_regs; (b)->handler = (tmp).handler - (delta); \ } while (0) -extern int fixup_exception(struct pt_regs *regs, int trapnr); +extern int fixup_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index e203169931c7..50ba74a34a37 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,6 +14,16 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +/* + * Exposed to assembly code for setting up initial page tables. Cannot be + * calculated in assembly code (fixmap entries are an enum), but is sanity + * checked in the actual fixmap C code to make sure that the fixmap is + * covered fully. + */ +#define FIXMAP_PMD_NUM 2 +/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ +#define FIXMAP_PMD_TOP 507 + #ifndef __ASSEMBLY__ #include <linux/kernel.h> #include <asm/acpi.h> @@ -152,7 +162,7 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags); -#ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PARAVIRT_XXL static inline void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a38bf5a1e37a..5f7290e6e954 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -226,7 +226,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) "3: movl $-2,%[err]\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -528,7 +528,7 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->initialized) { + if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h new file mode 100644 index 000000000000..eb377b6e9eed --- /dev/null +++ b/arch/x86/include/asm/fsgsbase.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_FSGSBASE_H +#define _ASM_FSGSBASE_H + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_64 + +#include <asm/msr-index.h> + +/* + * Read/write a task's FSBASE or GSBASE. This returns the value that + * the FS/GS base would have (if the task were to be resumed). These + * work on the current task or on a non-running (typically stopped + * ptrace child) task. + */ +extern unsigned long x86_fsbase_read_task(struct task_struct *task); +extern unsigned long x86_gsbase_read_task(struct task_struct *task); +extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); +extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); + +/* Helper functions for reading/writing FS/GS base */ + +static inline unsigned long x86_fsbase_read_cpu(void) +{ + unsigned long fsbase; + + rdmsrl(MSR_FS_BASE, fsbase); + + return fsbase; +} + +static inline unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + + return gsbase; +} + +extern void x86_fsbase_write_cpu(unsigned long fsbase); +extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); + +#endif /* CONFIG_X86_64 */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_FSGSBASE_H */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index de4d68852d3a..13c83fe97988 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -20,7 +20,7 @@ "3:\tmov\t%3, %1\n" \ "\tjmp\t2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ : "i" (-EFAULT), "0" (oparg), "1" (0)) @@ -36,8 +36,8 @@ "4:\tmov\t%5, %1\n" \ "\tjmp\t3b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 4b) \ - _ASM_EXTABLE(2b, 4b) \ + _ASM_EXTABLE_UA(1b, 4b) \ + _ASM_EXTABLE_UA(2b, 4b) \ : "=&a" (oldval), "=&r" (ret), \ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e977b6b3a538..4139f7650fe5 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -38,6 +38,8 @@ #define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) /* Partition reference TSC MSR is available */ #define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) +/* Partition Guest IDLE MSR is available */ +#define HV_X64_MSR_GUEST_IDLE_AVAILABLE (1 << 10) /* A partition's reference time stamp counter (TSC) page */ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 @@ -246,6 +248,9 @@ #define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 #define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +/* Hyper-V guest idle MSR */ +#define HV_X64_MSR_GUEST_IDLE 0x400000F0 + /* Hyper-V guest crash notification MSR's */ #define HV_X64_MSR_CRASH_P0 0x40000100 #define HV_X64_MSR_CRASH_P1 0x40000101 @@ -726,19 +731,21 @@ struct hv_enlightened_vmcs { #define HV_STIMER_AUTOENABLE (1ULL << 3) #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) -struct ipi_arg_non_ex { - u32 vector; - u32 reserved; - u64 cpu_mask; -}; - struct hv_vpset { u64 format; u64 valid_bank_mask; u64 bank_contents[]; }; -struct ipi_arg_ex { +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +}; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { u32 vector; u32 reserved; struct hv_vpset vp_set; diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 7ed08a7c3398..0dd6b0f4000e 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -8,9 +8,6 @@ * The "_X" parts are generally the EP and EX Xeons, or the * "Extreme" ones, like Broadwell-E. * - * Things ending in "2" are usually because we have no better - * name for them. There's no processor called "SILVERMONT2". - * * While adding a new CPUID for a new microarchitecture, add a new * group to keep logically sorted out in chronological order. Within * that group keep the CPUID for the variants sorted by model number. @@ -57,19 +54,23 @@ /* "Small Core" Processors (Atom) */ -#define INTEL_FAM6_ATOM_PINEVIEW 0x1C -#define INTEL_FAM6_ATOM_LINCROFT 0x26 -#define INTEL_FAM6_ATOM_PENWELL 0x27 -#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 -#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 -#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ -#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C -#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A +#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ +#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ + +#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ +#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ +#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ + +#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ +#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ + +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ +#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ + +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ +#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ +#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 6de64840dd22..832da8229cc7 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size) #define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc - extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); #define ioremap_prot ioremap_prot +extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); +#define ioremap_encrypted ioremap_encrypted /** * ioremap - map bus memory into CPU space @@ -369,18 +370,6 @@ extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); -#ifdef CONFIG_XEN -#include <xen/xen.h> -struct bio_vec; - -extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, - const struct bio_vec *vec2); - -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ - (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) -#endif /* CONFIG_XEN */ - #define IO_SPACE_LIMIT 0xffff #include <asm-generic/io.h> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 15450a675031..058e40fed167 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -64,7 +64,7 @@ static inline __cpuidle void native_halt(void) #endif -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #ifndef __ASSEMBLY__ @@ -123,6 +123,10 @@ static inline notrace unsigned long arch_local_irq_save(void) #define DISABLE_INTERRUPTS(x) cli #ifdef CONFIG_X86_64 +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(x) pushfq; popq %rax +#endif + #define SWAPGS swapgs /* * Currently paravirt can't handle swapgs nicely when we @@ -135,8 +139,6 @@ static inline notrace unsigned long arch_local_irq_save(void) */ #define SWAPGS_UNSAFE_STACK swapgs -#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ - #define INTERRUPT_RETURN jmp native_iret #define USERGS_SYSRET64 \ swapgs; \ @@ -145,18 +147,12 @@ static inline notrace unsigned long arch_local_irq_save(void) swapgs; \ sysretl -#ifdef CONFIG_DEBUG_ENTRY -#define SAVE_FLAGS(x) pushfq; popq %rax -#endif #else #define INTERRUPT_RETURN iret -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit -#define GET_CR0_INTO_EAX movl %cr0, %eax #endif - #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_PARAVIRT */ +#endif /* CONFIG_PARAVIRT_XXL */ #ifndef __ASSEMBLY__ static inline int arch_irqs_disabled_flags(unsigned long flags) diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 8c0de4282659..a5fb34fe56a4 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -2,19 +2,6 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H -#ifndef HAVE_JUMP_LABEL -/* - * For better or for worse, if jump labels (the gcc extension) are missing, - * then the entire static branch patching infrastructure is compiled out. - * If that happens, the code in here will malfunction. Raise a compiler - * error instead. - * - * In theory, jump labels and the static branch patching infrastructure - * could be decoupled to fix this. - */ -#error asm/jump_label.h included on a non-jump-label kernel -#endif - #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 @@ -33,14 +20,9 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" - ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" - ".popsection \n\t" - : : "i" (key), "i" (branch) : : l_yes); - + asm_volatile_goto("STATIC_BRANCH_NOP l_yes=\"%l[l_yes]\" key=\"%c0\" " + "branch=\"%c1\"" + : : "i" (key), "i" (branch) : : l_yes); return false; l_yes: return true; @@ -48,13 +30,8 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:" - ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" - "2:\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" - ".popsection \n\t" + asm_volatile_goto("STATIC_BRANCH_JMP l_yes=\"%l[l_yes]\" key=\"%c0\" " + "branch=\"%c1\"" : : "i" (key), "i" (branch) : : l_yes); return false; @@ -62,49 +39,28 @@ l_yes: return true; } -#ifdef CONFIG_X86_64 -typedef u64 jump_label_t; -#else -typedef u32 jump_label_t; -#endif - -struct jump_entry { - jump_label_t code; - jump_label_t target; - jump_label_t key; -}; - #else /* __ASSEMBLY__ */ -.macro STATIC_JUMP_IF_TRUE target, key, def -.Lstatic_jump_\@: - .if \def - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .else - .byte STATIC_KEY_INIT_NOP - .endif +.macro STATIC_BRANCH_NOP l_yes:req key:req branch:req +.Lstatic_branch_nop_\@: + .byte STATIC_KEY_INIT_NOP +.Lstatic_branch_no_after_\@: .pushsection __jump_table, "aw" _ASM_ALIGN - _ASM_PTR .Lstatic_jump_\@, \target, \key + .long .Lstatic_branch_nop_\@ - ., \l_yes - . + _ASM_PTR \key + \branch - . .popsection .endm -.macro STATIC_JUMP_IF_FALSE target, key, def -.Lstatic_jump_\@: - .if \def - .byte STATIC_KEY_INIT_NOP - .else - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .endif +.macro STATIC_BRANCH_JMP l_yes:req key:req branch:req +.Lstatic_branch_jmp_\@: + .byte 0xe9 + .long \l_yes - .Lstatic_branch_jmp_after_\@ +.Lstatic_branch_jmp_after_\@: .pushsection __jump_table, "aw" _ASM_ALIGN - _ASM_PTR .Lstatic_jump_\@, \target, \key + 1 + .long .Lstatic_branch_jmp_\@ - ., \l_yes - . + _ASM_PTR \key + \branch - . .popsection .endm diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 395c9631e000..75f1e35e7c15 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -22,10 +22,20 @@ enum die_val { DIE_NMIUNKNOWN, }; +enum show_regs_mode { + SHOW_REGS_SHORT, + /* + * For when userspace crashed, but we don't think it's our fault, and + * therefore don't print kernel registers. + */ + SHOW_REGS_USER, + SHOW_REGS_ALL +}; + extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); -extern void __show_regs(struct pt_regs *regs, int all); +extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); extern void show_iret_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index f327236f0fa7..5125fca472bb 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -67,7 +67,7 @@ struct kimage; /* Memory to backup during crash kdump */ #define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ +#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* * CPU does not save ss and sp on stack if execution is already diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0f82cd91cd3c..93c4bf598fb0 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -364,6 +364,10 @@ struct x86_emulate_ctxt { #define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 #define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 +#define X86EMUL_CPUID_VENDOR_HygonGenuine_ebx 0x6f677948 +#define X86EMUL_CPUID_VENDOR_HygonGenuine_ecx 0x656e6975 +#define X86EMUL_CPUID_VENDOR_HygonGenuine_edx 0x6e65476e + #define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 #define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e #define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 00ddb0c9e612..09b2e3e2cf1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -869,6 +869,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + bool guest_can_read_msr_platform_info; }; struct kvm_vm_stat { @@ -1022,6 +1024,7 @@ struct kvm_x86_ops { void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); @@ -1055,6 +1058,7 @@ struct kvm_x86_ops { bool (*umip_emulated)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1237,19 +1241,12 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -#define EMULTYPE_RETRY (1 << 3) -#define EMULTYPE_NO_REEXECUTE (1 << 4) -#define EMULTYPE_NO_UD_ON_FAIL (1 << 5) -#define EMULTYPE_VMWARE (1 << 6) -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, - int emulation_type, void *insn, int insn_len); - -static inline int emulate_instruction(struct kvm_vcpu *vcpu, - int emulation_type) -{ - return x86_emulate_instruction(vcpu, 0, - emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); -} +#define EMULTYPE_ALLOW_RETRY (1 << 3) +#define EMULTYPE_NO_UD_ON_FAIL (1 << 4) +#define EMULTYPE_VMWARE (1 << 5) +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -1450,7 +1447,6 @@ asmlinkage void kvm_spurious_fault(void); ____kvm_handle_fault_on_reboot(insn, "") #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); @@ -1463,7 +1459,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); u64 kvm_get_arch_capabilities(void); @@ -1490,6 +1486,7 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index c91083c59845..349a47acaa4a 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -53,7 +53,7 @@ static inline void local_sub(long i, local_t *l) */ static inline bool local_sub_and_test(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i); } /** @@ -66,7 +66,7 @@ static inline bool local_sub_and_test(long i, local_t *l) */ static inline bool local_dec_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e); + return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e); } /** @@ -79,7 +79,7 @@ static inline bool local_dec_and_test(local_t *l) */ static inline bool local_inc_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e); + return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e); } /** @@ -93,7 +93,7 @@ static inline bool local_inc_and_test(local_t *l) */ static inline bool local_add_negative(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i); } /** diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3a17107594c8..4da9b1c58d28 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -10,41 +10,44 @@ /* MCG_CAP register defines */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ -#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ -#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ +#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ +#define MCG_EXT_P BIT_ULL(9) /* Extended registers available */ +#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) -#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ -#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ -#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ +#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */ +#define MCG_ELOG_P BIT_ULL(26) /* Extended error log supported */ +#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */ /* MCG_STATUS register defines */ -#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ -#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ -#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ -#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */ +#define MCG_STATUS_RIPV BIT_ULL(0) /* restart ip valid */ +#define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */ +#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */ +#define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */ /* MCG_EXT_CTL register defines */ -#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */ +#define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */ /* MCi_STATUS register defines */ -#define MCI_STATUS_VAL (1ULL<<63) /* valid error */ -#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ -#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ -#define MCI_STATUS_EN (1ULL<<60) /* error enabled */ -#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ -#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ -#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ -#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ -#define MCI_STATUS_AR (1ULL<<55) /* Action required */ +#define MCI_STATUS_VAL BIT_ULL(63) /* valid error */ +#define MCI_STATUS_OVER BIT_ULL(62) /* previous errors lost */ +#define MCI_STATUS_UC BIT_ULL(61) /* uncorrected error */ +#define MCI_STATUS_EN BIT_ULL(60) /* error enabled */ +#define MCI_STATUS_MISCV BIT_ULL(59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV BIT_ULL(58) /* addr reg. valid */ +#define MCI_STATUS_PCC BIT_ULL(57) /* processor context corrupt */ +#define MCI_STATUS_S BIT_ULL(56) /* Signaled machine check */ +#define MCI_STATUS_AR BIT_ULL(55) /* Action required */ +#define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */ +#define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38) +#define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT) /* AMD-specific bits */ -#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ -#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */ -#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ -#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ +#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */ +#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */ +#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */ /* * McaX field if set indicates a given bank supports MCA extensions: @@ -84,7 +87,7 @@ #define MCI_MISC_ADDR_GENERIC 7 /* generic */ /* CTL2 register defines */ -#define MCI_CTL2_CMCI_EN (1ULL << 30) +#define MCI_CTL2_CMCI_EN BIT_ULL(30) #define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL #define MCJ_CTX_MASK 3 @@ -214,6 +217,8 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif +static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } + int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c0643831706e..616f8e637bc3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); +#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +#define __bss_decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* @@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa(x) (__pa(x) | sme_me_mask) #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) +extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index eeeb9289c764..0ca50611e8ce 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -16,12 +16,12 @@ extern atomic64_t last_mm_ctx_id; -#ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { } -#endif /* !CONFIG_PARAVIRT */ +#endif /* !CONFIG_PARAVIRT_XXL */ #ifdef CONFIG_PERF_EVENTS diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 61eb4b63c5ec..d0b1434fb0b6 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -57,8 +57,14 @@ #define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) #define MPX_BNDSTA_ERROR_CODE 0x3 +struct mpx_fault_info { + void __user *addr; + void __user *lower; + void __user *upper; +}; + #ifdef CONFIG_X86_INTEL_MPX -siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); +int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs); int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { @@ -78,9 +84,9 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, unsigned long flags); #else -static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) +static inline int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs) { - return NULL; + return -EINVAL; } static inline int mpx_handle_bd_fault(void) { diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f37704497d8f..0d6271cce198 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -351,6 +351,8 @@ int hyperv_flush_guest_mapping(u64 as); #ifdef CONFIG_X86_64 void hv_apic_init(void); +void __init hv_init_spinlocks(void); +bool hv_vcpu_is_preempted(int vcpu); #else static inline void hv_apic_init(void) {} #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4731f0cf97c5..80f4a4f38c79 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -164,6 +164,7 @@ #define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 04addd6e0a4a..91e4cf189914 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -242,7 +242,7 @@ static inline unsigned long long native_read_pmc(int counter) return EAX_EDX_VAL(val, low, high); } -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #include <linux/errno.h> @@ -305,7 +305,7 @@ do { \ #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) -#endif /* !CONFIG_PARAVIRT */ +#endif /* !CONFIG_PARAVIRT_XXL */ /* * 64-bit version of wrmsr_safe(): diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index fd2a8c1b88bc..80dc14422495 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -170,11 +170,15 @@ */ # define CALL_NOSPEC \ ANNOTATE_NOSPEC_ALTERNATIVE \ - ALTERNATIVE( \ + ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE) + X86_FEATURE_RETPOLINE, \ + "lfence;\n" \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + X86_FEATURE_RETPOLINE_AMD) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) @@ -184,7 +188,8 @@ * here, anyway. */ # define CALL_NOSPEC \ - ALTERNATIVE( \ + ANNOTATE_NOSPEC_ALTERNATIVE \ + ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ " jmp 904f;\n" \ @@ -199,7 +204,11 @@ " ret;\n" \ " .align 16\n" \ "904: call 901b;\n", \ - X86_FEATURE_RETPOLINE) + X86_FEATURE_RETPOLINE, \ + "lfence;\n" \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + X86_FEATURE_RETPOLINE_AMD) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #else /* No retpoline for C / inline asm */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6afac386a434..cd0cf1c568b4 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -59,13 +59,16 @@ #endif /* - * Kernel image size is limited to 1GiB due to the fixmap living in the - * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use - * 512MiB by default, leaving 1.5GiB for modules once the page tables - * are fully set up. If kernel ASLR is configured, it can extend the - * kernel page table mapping, reducing the size of the modules area. + * Maximum kernel image size is limited to 1 GiB, due to the fixmap living + * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). + * + * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the + * page tables are fully set up. + * + * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size + * of the modules area to 1.5 GiB. */ -#if defined(CONFIG_RANDOMIZE_BASE) +#ifdef CONFIG_RANDOMIZE_BASE #define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index e375d4266b53..4bf42f9e4eea 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -17,16 +17,73 @@ #include <linux/cpumask.h> #include <asm/frame.h> +static inline unsigned long long paravirt_sched_clock(void) +{ + return PVOP_CALL0(unsigned long long, time.sched_clock); +} + +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; + +static inline u64 paravirt_steal_clock(int cpu) +{ + return PVOP_CALL1(u64, time.steal_clock, cpu); +} + +/* The paravirtualized I/O functions */ +static inline void slow_down_io(void) +{ + pv_ops.cpu.io_delay(); +#ifdef REALLY_SLOW_IO + pv_ops.cpu.io_delay(); + pv_ops.cpu.io_delay(); + pv_ops.cpu.io_delay(); +#endif +} + +static inline void __flush_tlb(void) +{ + PVOP_VCALL0(mmu.flush_tlb_user); +} + +static inline void __flush_tlb_global(void) +{ + PVOP_VCALL0(mmu.flush_tlb_kernel); +} + +static inline void __flush_tlb_one_user(unsigned long addr) +{ + PVOP_VCALL1(mmu.flush_tlb_one_user, addr); +} + +static inline void flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info); +} + +static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); +} + +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) +{ + PVOP_VCALL1(mmu.exit_mmap, mm); +} + +#ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { - PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); + PVOP_VCALL1(cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { - PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx); + PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); } /* @@ -34,98 +91,98 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, */ static inline unsigned long paravirt_get_debugreg(int reg) { - return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg); + return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static inline void set_debugreg(unsigned long val, int reg) { - PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val); + PVOP_VCALL2(cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { - return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0); + return PVOP_CALL0(unsigned long, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { - PVOP_VCALL1(pv_cpu_ops.write_cr0, x); + PVOP_VCALL1(cpu.write_cr0, x); } static inline unsigned long read_cr2(void) { - return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2); + return PVOP_CALL0(unsigned long, mmu.read_cr2); } static inline void write_cr2(unsigned long x) { - PVOP_VCALL1(pv_mmu_ops.write_cr2, x); + PVOP_VCALL1(mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { - return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); + return PVOP_CALL0(unsigned long, mmu.read_cr3); } static inline void write_cr3(unsigned long x) { - PVOP_VCALL1(pv_mmu_ops.write_cr3, x); + PVOP_VCALL1(mmu.write_cr3, x); } static inline void __write_cr4(unsigned long x) { - PVOP_VCALL1(pv_cpu_ops.write_cr4, x); + PVOP_VCALL1(cpu.write_cr4, x); } #ifdef CONFIG_X86_64 static inline unsigned long read_cr8(void) { - return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8); + return PVOP_CALL0(unsigned long, cpu.read_cr8); } static inline void write_cr8(unsigned long x) { - PVOP_VCALL1(pv_cpu_ops.write_cr8, x); + PVOP_VCALL1(cpu.write_cr8, x); } #endif static inline void arch_safe_halt(void) { - PVOP_VCALL0(pv_irq_ops.safe_halt); + PVOP_VCALL0(irq.safe_halt); } static inline void halt(void) { - PVOP_VCALL0(pv_irq_ops.halt); + PVOP_VCALL0(irq.halt); } static inline void wbinvd(void) { - PVOP_VCALL0(pv_cpu_ops.wbinvd); + PVOP_VCALL0(cpu.wbinvd); } #define get_kernel_rpl() (pv_info.kernel_rpl) static inline u64 paravirt_read_msr(unsigned msr) { - return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr); + return PVOP_CALL1(u64, cpu.read_msr, msr); } static inline void paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { - PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high); + PVOP_VCALL3(cpu.write_msr, msr, low, high); } static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { - return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err); + return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); } static inline int paravirt_write_msr_safe(unsigned msr, unsigned low, unsigned high) { - return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high); + return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); } #define rdmsr(msr, val1, val2) \ @@ -170,23 +227,9 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline unsigned long long paravirt_sched_clock(void) -{ - return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); -} - -struct static_key; -extern struct static_key paravirt_steal_enabled; -extern struct static_key paravirt_steal_rq_enabled; - -static inline u64 paravirt_steal_clock(int cpu) -{ - return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu); -} - static inline unsigned long long paravirt_read_pmc(int counter) { - return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); + return PVOP_CALL1(u64, cpu.read_pmc, counter); } #define rdpmc(counter, low, high) \ @@ -200,166 +243,127 @@ do { \ static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); + PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { - PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries); + PVOP_VCALL2(cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { - PVOP_VCALL0(pv_cpu_ops.load_tr_desc); + PVOP_VCALL0(cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { - PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr); + PVOP_VCALL1(cpu.load_gdt, dtr); } static inline void load_idt(const struct desc_ptr *dtr) { - PVOP_VCALL1(pv_cpu_ops.load_idt, dtr); + PVOP_VCALL1(cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { - PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); + PVOP_VCALL2(cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { - return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr); + return PVOP_CALL0(unsigned long, cpu.store_tr); } + #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { - PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu); + PVOP_VCALL2(cpu.load_tls, t, cpu); } #ifdef CONFIG_X86_64 static inline void load_gs_index(unsigned int gs) { - PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs); + PVOP_VCALL1(cpu.load_gs_index, gs); } #endif static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { - PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc); + PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { - PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type); + PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { - PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g); + PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } static inline void set_iopl_mask(unsigned mask) { - PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask); -} - -/* The paravirtualized I/O functions */ -static inline void slow_down_io(void) -{ - pv_cpu_ops.io_delay(); -#ifdef REALLY_SLOW_IO - pv_cpu_ops.io_delay(); - pv_cpu_ops.io_delay(); - pv_cpu_ops.io_delay(); -#endif + PVOP_VCALL1(cpu.set_iopl_mask, mask); } static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { - PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); + PVOP_VCALL2(mmu.activate_mm, prev, next); } static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { - PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); -} - -static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) -{ - PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); -} - -static inline void __flush_tlb(void) -{ - PVOP_VCALL0(pv_mmu_ops.flush_tlb_user); -} -static inline void __flush_tlb_global(void) -{ - PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel); -} -static inline void __flush_tlb_one_user(unsigned long addr) -{ - PVOP_VCALL1(pv_mmu_ops.flush_tlb_one_user, addr); -} - -static inline void flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ - PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info); -} - -static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table); + PVOP_VCALL2(mmu.dup_mmap, oldmm, mm); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { - return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm); + return PVOP_CALL1(int, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { - PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd); + PVOP_VCALL2(mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { - PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn); + PVOP_VCALL2(mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { - PVOP_VCALL1(pv_mmu_ops.release_pte, pfn); + PVOP_VCALL1(mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { - PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); + PVOP_VCALL2(mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { - PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); + PVOP_VCALL1(mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { - PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn); + PVOP_VCALL2(mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { - PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); + PVOP_VCALL1(mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { - PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn); + PVOP_VCALL2(mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { - PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn); + PVOP_VCALL1(mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) @@ -367,13 +371,9 @@ static inline pte_t __pte(pteval_t val) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pteval_t, - pv_mmu_ops.make_pte, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pteval_t, mmu.make_pte, val, (u64)val >> 32); else - ret = PVOP_CALLEE1(pteval_t, - pv_mmu_ops.make_pte, - val); + ret = PVOP_CALLEE1(pteval_t, mmu.make_pte, val); return (pte_t) { .pte = ret }; } @@ -383,11 +383,10 @@ static inline pteval_t pte_val(pte_t pte) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val, + ret = PVOP_CALLEE2(pteval_t, mmu.pte_val, pte.pte, (u64)pte.pte >> 32); else - ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val, - pte.pte); + ret = PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); return ret; } @@ -397,11 +396,9 @@ static inline pgd_t __pgd(pgdval_t val) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pgdval_t, mmu.make_pgd, val, (u64)val >> 32); else - ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd, - val); + ret = PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val); return (pgd_t) { ret }; } @@ -411,11 +408,10 @@ static inline pgdval_t pgd_val(pgd_t pgd) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val, + ret = PVOP_CALLEE2(pgdval_t, mmu.pgd_val, pgd.pgd, (u64)pgd.pgd >> 32); else - ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val, - pgd.pgd); + ret = PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); return ret; } @@ -426,8 +422,7 @@ static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long a { pteval_t ret; - ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start, - mm, addr, ptep); + ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep); return (pte_t) { .pte = ret }; } @@ -437,20 +432,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long a { if (sizeof(pteval_t) > sizeof(long)) /* 5 arg words */ - pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte); + pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte); else - PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit, + PVOP_VCALL4(mmu.ptep_modify_prot_commit, mm, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { if (sizeof(pteval_t) > sizeof(long)) - PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, - pte.pte, (u64)pte.pte >> 32); + PVOP_VCALL3(mmu.set_pte, ptep, pte.pte, (u64)pte.pte >> 32); else - PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, - pte.pte); + PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, @@ -458,9 +451,9 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, { if (sizeof(pteval_t) > sizeof(long)) /* 5 arg words */ - pv_mmu_ops.set_pte_at(mm, addr, ptep, pte); + pv_ops.mmu.set_pte_at(mm, addr, ptep, pte); else - PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); + PVOP_VCALL4(mmu.set_pte_at, mm, addr, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) @@ -468,9 +461,9 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) pmdval_t val = native_pmd_val(pmd); if (sizeof(pmdval_t) > sizeof(long)) - PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32); + PVOP_VCALL3(mmu.set_pmd, pmdp, val, (u64)val >> 32); else - PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); + PVOP_VCALL2(mmu.set_pmd, pmdp, val); } #if CONFIG_PGTABLE_LEVELS >= 3 @@ -479,11 +472,9 @@ static inline pmd_t __pmd(pmdval_t val) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pmdval_t, mmu.make_pmd, val, (u64)val >> 32); else - ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd, - val); + ret = PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val); return (pmd_t) { ret }; } @@ -493,11 +484,10 @@ static inline pmdval_t pmd_val(pmd_t pmd) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val, + ret = PVOP_CALLEE2(pmdval_t, mmu.pmd_val, pmd.pmd, (u64)pmd.pmd >> 32); else - ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val, - pmd.pmd); + ret = PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); return ret; } @@ -507,39 +497,23 @@ static inline void set_pud(pud_t *pudp, pud_t pud) pudval_t val = native_pud_val(pud); if (sizeof(pudval_t) > sizeof(long)) - PVOP_VCALL3(pv_mmu_ops.set_pud, pudp, - val, (u64)val >> 32); + PVOP_VCALL3(mmu.set_pud, pudp, val, (u64)val >> 32); else - PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, - val); + PVOP_VCALL2(mmu.set_pud, pudp, val); } #if CONFIG_PGTABLE_LEVELS >= 4 static inline pud_t __pud(pudval_t val) { pudval_t ret; - if (sizeof(pudval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud, - val, (u64)val >> 32); - else - ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud, - val); + ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { - pudval_t ret; - - if (sizeof(pudval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val, - pud.pud, (u64)pud.pud >> 32); - else - ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val, - pud.pud); - - return ret; + return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud); } static inline void pud_clear(pud_t *pudp) @@ -551,31 +525,26 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); - if (sizeof(p4dval_t) > sizeof(long)) - PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp, - val, (u64)val >> 32); - else - PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp, - val); + PVOP_VCALL2(mmu.set_p4d, p4dp, val); } #if CONFIG_PGTABLE_LEVELS >= 5 static inline p4d_t __p4d(p4dval_t val) { - p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val); + p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { - return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d); + return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { - PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd)); + PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ @@ -606,19 +575,18 @@ static inline void p4d_clear(p4d_t *p4dp) 64-bit pte atomically */ static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { - PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep, - pte.pte, pte.pte >> 32); + PVOP_VCALL3(mmu.set_pte_atomic, ptep, pte.pte, pte.pte >> 32); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep); + PVOP_VCALL3(mmu.pte_clear, mm, addr, ptep); } static inline void pmd_clear(pmd_t *pmdp) { - PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp); + PVOP_VCALL1(mmu.pmd_clear, pmdp); } #else /* !CONFIG_X86_PAE */ static inline void set_pte_atomic(pte_t *ptep, pte_t pte) @@ -641,64 +609,68 @@ static inline void pmd_clear(pmd_t *pmdp) #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { - PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev); + PVOP_VCALL1(cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { - PVOP_VCALL1(pv_cpu_ops.end_context_switch, next); + PVOP_VCALL1(cpu.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { - PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter); + PVOP_VCALL0(mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { - PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); + PVOP_VCALL0(mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { - PVOP_VCALL0(pv_mmu_ops.lazy_mode.flush); + PVOP_VCALL0(mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { - pv_mmu_ops.set_fixmap(idx, phys, flags); + pv_ops.mmu.set_fixmap(idx, phys, flags); } +#endif #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { - PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val); + PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val); } static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { - PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock); + PVOP_VCALLEE1(lock.queued_spin_unlock, lock); } static __always_inline void pv_wait(u8 *ptr, u8 val) { - PVOP_VCALL2(pv_lock_ops.wait, ptr, val); + PVOP_VCALL2(lock.wait, ptr, val); } static __always_inline void pv_kick(int cpu) { - PVOP_VCALL1(pv_lock_ops.kick, cpu); + PVOP_VCALL1(lock.kick, cpu); } static __always_inline bool pv_vcpu_is_preempted(long cpu) { - return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu); + return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu); } +void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); +bool __raw_callee_save___native_vcpu_is_preempted(long cpu); + #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 @@ -778,24 +750,25 @@ static __always_inline bool pv_vcpu_is_preempted(long cpu) #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) +#ifdef CONFIG_PARAVIRT_XXL static inline notrace unsigned long arch_local_save_flags(void) { - return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); + return PVOP_CALLEE0(unsigned long, irq.save_fl); } static inline notrace void arch_local_irq_restore(unsigned long f) { - PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); + PVOP_VCALLEE1(irq.restore_fl, f); } static inline notrace void arch_local_irq_disable(void) { - PVOP_VCALLEE0(pv_irq_ops.irq_disable); + PVOP_VCALLEE0(irq.irq_disable); } static inline notrace void arch_local_irq_enable(void) { - PVOP_VCALLEE0(pv_irq_ops.irq_enable); + PVOP_VCALLEE0(irq.irq_enable); } static inline notrace unsigned long arch_local_irq_save(void) @@ -806,6 +779,7 @@ static inline notrace unsigned long arch_local_irq_save(void) arch_local_irq_disable(); return f; } +#endif /* Make sure as little as possible of this mess escapes. */ @@ -827,7 +801,7 @@ extern void default_banner(void); #else /* __ASSEMBLY__ */ -#define _PVSITE(ptype, clobbers, ops, word, algn) \ +#define _PVSITE(ptype, ops, word, algn) \ 771:; \ ops; \ 772:; \ @@ -836,7 +810,6 @@ extern void default_banner(void); word 771b; \ .byte ptype; \ .byte 772b-771b; \ - .short clobbers; \ .popsection @@ -868,8 +841,8 @@ extern void default_banner(void); COND_POP(set, CLBR_RCX, rcx); \ COND_POP(set, CLBR_RAX, rax) -#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) -#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) +#define PARA_PATCH(off) ((off) / 8) +#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) #else #define PV_SAVE_REGS(set) \ @@ -883,46 +856,41 @@ extern void default_banner(void); COND_POP(set, CLBR_EDI, edi); \ COND_POP(set, CLBR_EAX, eax) -#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) -#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) +#define PARA_PATCH(off) ((off) / 4) +#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4) #define PARA_INDIRECT(addr) *%cs:addr #endif +#ifdef CONFIG_PARAVIRT_XXL #define INTERRUPT_RETURN \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret);) + PARA_SITE(PARA_PATCH(PV_CPU_iret), \ + ANNOTATE_RETPOLINE_SAFE; \ + jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);) #define DISABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ + PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #define ENABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ + PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +#endif -#ifdef CONFIG_X86_32 -#define GET_CR0_INTO_EAX \ - push %ecx; push %edx; \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ - pop %edx; pop %ecx -#else /* !CONFIG_X86_32 */ - +#ifdef CONFIG_X86_64 +#ifdef CONFIG_PARAVIRT_XXL /* * If swapgs is used while the userspace stack is still current, * there's no way to call a pvop. The PV replacement *must* be * inlined, or the swapgs instruction must be trapped and emulated. */ #define SWAPGS_UNSAFE_STACK \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ - swapgs) + PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs) /* * Note: swapgs is very special, and in practise is either going to be @@ -931,44 +899,51 @@ extern void default_banner(void); * it. */ #define SWAPGS \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ + PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ ) +#endif #define GET_CR2_INTO_RAX \ ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); + call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); +#ifdef CONFIG_PARAVIRT_XXL #define USERGS_SYSRET64 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ - CLBR_NONE, \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64);) + PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ + ANNOTATE_RETPOLINE_SAFE; \ + jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);) #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS(clobbers) \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ + PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #endif +#endif #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop +#endif /* !CONFIG_PARAVIRT */ + #ifndef __ASSEMBLY__ +#ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { } +#endif +#ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } +#endif #endif /* __ASSEMBLY__ */ -#endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4b75acc23b30..fba54ca23b2a 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -66,12 +66,14 @@ struct paravirt_callee_save { /* general info */ struct pv_info { +#ifdef CONFIG_PARAVIRT_XXL unsigned int kernel_rpl; int shared_kernel_pmd; #ifdef CONFIG_X86_64 u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif +#endif const char *name; }; @@ -85,17 +87,18 @@ struct pv_init_ops { * the number of bytes of code generated, as we nop pad the * rest in generic code. */ - unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, + unsigned (*patch)(u8 type, void *insnbuf, unsigned long addr, unsigned len); } __no_randomize_layout; - +#ifdef CONFIG_PARAVIRT_XXL struct pv_lazy_ops { /* Set deferred update mode, used for batching operations. */ void (*enter)(void); void (*leave)(void); void (*flush)(void); } __no_randomize_layout; +#endif struct pv_time_ops { unsigned long long (*sched_clock)(void); @@ -104,6 +107,9 @@ struct pv_time_ops { struct pv_cpu_ops { /* hooks for various privileged instructions */ + void (*io_delay)(void); + +#ifdef CONFIG_PARAVIRT_XXL unsigned long (*get_debugreg)(int regno); void (*set_debugreg)(int regno, unsigned long value); @@ -141,7 +147,6 @@ struct pv_cpu_ops { void (*set_iopl_mask)(unsigned mask); void (*wbinvd)(void); - void (*io_delay)(void); /* cpuid emulation, mostly so that caps bits can be disabled */ void (*cpuid)(unsigned int *eax, unsigned int *ebx, @@ -176,9 +181,11 @@ struct pv_cpu_ops { void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); +#endif } __no_randomize_layout; struct pv_irq_ops { +#ifdef CONFIG_PARAVIRT_XXL /* * Get/set interrupt state. save_fl and restore_fl are only * expected to use X86_EFLAGS_IF; all other bits @@ -195,35 +202,34 @@ struct pv_irq_ops { void (*safe_halt)(void); void (*halt)(void); - +#endif } __no_randomize_layout; struct pv_mmu_ops { + /* TLB operations */ + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_one_user)(unsigned long addr); + void (*flush_tlb_others)(const struct cpumask *cpus, + const struct flush_tlb_info *info); + + void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); + + /* Hook for intercepting the destruction of an mm_struct. */ + void (*exit_mmap)(struct mm_struct *mm); + +#ifdef CONFIG_PARAVIRT_XXL unsigned long (*read_cr2)(void); void (*write_cr2)(unsigned long); unsigned long (*read_cr3)(void); void (*write_cr3)(unsigned long); - /* - * Hooks for intercepting the creation/use/destruction of an - * mm_struct. - */ + /* Hooks for intercepting the creation/use of an mm_struct. */ void (*activate_mm)(struct mm_struct *prev, struct mm_struct *next); void (*dup_mmap)(struct mm_struct *oldmm, struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); - - - /* TLB operations */ - void (*flush_tlb_user)(void); - void (*flush_tlb_kernel)(void); - void (*flush_tlb_one_user)(unsigned long addr); - void (*flush_tlb_others)(const struct cpumask *cpus, - const struct flush_tlb_info *info); - - void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); @@ -298,6 +304,7 @@ struct pv_mmu_ops { an mfn. We can tell which is which from the index. */ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags); +#endif } __no_randomize_layout; struct arch_spinlock; @@ -321,48 +328,31 @@ struct pv_lock_ops { * number for each function using the offset which we use to indicate * what to patch. */ struct paravirt_patch_template { - struct pv_init_ops pv_init_ops; - struct pv_time_ops pv_time_ops; - struct pv_cpu_ops pv_cpu_ops; - struct pv_irq_ops pv_irq_ops; - struct pv_mmu_ops pv_mmu_ops; - struct pv_lock_ops pv_lock_ops; + struct pv_init_ops init; + struct pv_time_ops time; + struct pv_cpu_ops cpu; + struct pv_irq_ops irq; + struct pv_mmu_ops mmu; + struct pv_lock_ops lock; } __no_randomize_layout; extern struct pv_info pv_info; -extern struct pv_init_ops pv_init_ops; -extern struct pv_time_ops pv_time_ops; -extern struct pv_cpu_ops pv_cpu_ops; -extern struct pv_irq_ops pv_irq_ops; -extern struct pv_mmu_ops pv_mmu_ops; -extern struct pv_lock_ops pv_lock_ops; +extern struct paravirt_patch_template pv_ops; #define PARAVIRT_PATCH(x) \ (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) #define paravirt_type(op) \ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "i" (&(op)) + [paravirt_opptr] "i" (&(pv_ops.op)) #define paravirt_clobber(clobber) \ [paravirt_clobber] "i" (clobber) -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - /* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") +#define paravirt_call \ + "PARAVIRT_CALL type=\"%c[paravirt_typenum]\"" \ + " clobber=\"%c[paravirt_clobber]\"" \ + " pv_opptr=\"%c[paravirt_opptr]\";" /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" @@ -373,34 +363,17 @@ extern struct pv_lock_ops pv_lock_ops; unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_call(void *insnbuf, - const void *target, u16 tgt_clobbers, - unsigned long addr, u16 site_clobbers, - unsigned len); -unsigned paravirt_patch_jmp(void *insnbuf, const void *target, - unsigned long addr, unsigned len); -unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, +unsigned paravirt_patch_default(u8 type, void *insnbuf, unsigned long addr, unsigned len); unsigned paravirt_patch_insns(void *insnbuf, unsigned len, const char *start, const char *end); -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len); +unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len); int paravirt_disable_iospace(void); /* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ -#define PARAVIRT_CALL \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%c[paravirt_opptr];" - -/* * These macros are intended to wrap calls through one of the paravirt * ops structs, so that they can be later identified and patched at * runtime. @@ -510,9 +483,9 @@ int paravirt_disable_iospace(void); #endif /* CONFIG_X86_32 */ #ifdef CONFIG_PARAVIRT_DEBUG -#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#define PVOP_TEST_NULL(op) BUG_ON(pv_ops.op == NULL) #else -#define PVOP_TEST_NULL(op) ((void)op) +#define PVOP_TEST_NULL(op) ((void)pv_ops.op) #endif #define PVOP_RETMASK(rettype) \ @@ -537,7 +510,7 @@ int paravirt_disable_iospace(void); /* since this condition will never hold */ \ if (sizeof(rettype) > sizeof(unsigned long)) { \ asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ + paravirt_call \ post \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ @@ -547,7 +520,7 @@ int paravirt_disable_iospace(void); __ret = (rettype)((((u64)__edx) << 32) | __eax); \ } else { \ asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ + paravirt_call \ post \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ @@ -574,7 +547,7 @@ int paravirt_disable_iospace(void); PVOP_VCALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ + paravirt_call \ post \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ @@ -688,12 +661,31 @@ struct paravirt_patch_site { u8 *instr; /* original instructions */ u8 instrtype; /* type of this instruction */ u8 len; /* length of original instruction */ - u16 clobbers; /* what registers you may clobber */ }; extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; +#else /* __ASSEMBLY__ */ + +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. + */ +.macro PARAVIRT_CALL type:req clobber:req pv_opptr:req +771: ANNOTATE_RETPOLINE_SAFE + call *\pv_opptr +772: .pushsection .parainstructions,"a" + _ASM_ALIGN + _ASM_PTR 771b + .byte \type + .byte 772b-771b + .short \clobber + .popsection +.endm + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e9202a0de8f0..1a19d11cfbbd 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -185,22 +185,22 @@ do { \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(1)",%0" \ + asm volatile(op "b "__percpu_arg(1)",%0"\ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(1)",%0" \ + asm volatile(op "w "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(1)",%0" \ + asm volatile(op "l "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(1)",%0" \ + asm volatile(op "q "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 12f54082f4c8..8bdf74902293 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -46,6 +46,14 @@ #define INTEL_ARCH_EVENT_MASK \ (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT) +#define AMD64_L3_SLICE_SHIFT 48 +#define AMD64_L3_SLICE_MASK \ + ((0xFULL) << AMD64_L3_SLICE_SHIFT) + +#define AMD64_L3_THREAD_SHIFT 56 +#define AMD64_L3_THREAD_MASK \ + ((0xFFULL) << AMD64_L3_THREAD_SHIFT) + #define X86_RAW_EVENT_MASK \ (ARCH_PERFMON_EVENTSEL_EVENT | \ ARCH_PERFMON_EVENTSEL_UMASK | \ @@ -270,6 +278,7 @@ struct perf_guest_switch_msr { extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); +extern int x86_perf_rdpmc_index(struct perf_event *event); #else static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index fbd578daa66e..ec7f43327033 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -8,7 +8,7 @@ static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 24c6cf5f16b7..60d0f9015317 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -19,9 +19,6 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd); -#endif *pmdp = pmd; } @@ -61,9 +58,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0)); -#endif return __pmd(xchg((pmdval_t *)xp, 0)); } #else @@ -73,9 +67,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #ifdef CONFIG_SMP static inline pud_t native_pudp_get_and_clear(pud_t *xp) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0)); -#endif return __pud(xchg((pudval_t *)xp, 0)); } #else diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 858358a82b14..33845d36897c 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -20,7 +20,7 @@ typedef union { } pte_t; #endif /* !__ASSEMBLY__ */ -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \ (pv_info.shared_kernel_pmd))) #else diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e4ffa565a69f..40616e805292 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -55,9 +55,9 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); extern pmdval_t early_pmd_flags; -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> -#else /* !CONFIG_PARAVIRT */ +#else /* !CONFIG_PARAVIRT_XXL */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) @@ -112,8 +112,7 @@ extern pmdval_t early_pmd_flags; #define __pte(x) native_make_pte(x) #define arch_end_context_switch(prev) do {} while(0) - -#endif /* CONFIG_PARAVIRT */ +#endif /* CONFIG_PARAVIRT_XXL */ /* * The following only work if pte_present() is true. @@ -1195,7 +1194,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, return xchg(pmdp, pmd); } else { pmd_t old = *pmdp; - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); return old; } } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f773d5e6c8cc..9c85b54bf03c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -14,6 +14,7 @@ #include <asm/processor.h> #include <linux/bitops.h> #include <linux/threads.h> +#include <asm/fixmap.h> extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; @@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; -extern pte_t level1_fixmap_pgt[512]; +extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt @@ -55,15 +56,15 @@ struct mm_struct; void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); -static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) +static inline void native_set_pte(pte_t *ptep, pte_t pte) { - *ptep = native_make_pte(0); + WRITE_ONCE(*ptep, pte); } -static inline void native_set_pte(pte_t *ptep, pte_t pte) +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - *ptep = pte; + native_set_pte(ptep, native_make_pte(0)); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) @@ -73,7 +74,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); } static inline void native_pmd_clear(pmd_t *pmd) @@ -109,7 +110,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) static inline void native_set_pud(pud_t *pudp, pud_t pud) { - *pudp = pud; + WRITE_ONCE(*pudp, pud); } static inline void native_pud_clear(pud_t *pud) @@ -137,13 +138,13 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) pgd_t pgd; if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { - *p4dp = p4d; + WRITE_ONCE(*p4dp, p4d); return; } pgd = native_make_pgd(native_p4d_val(p4d)); pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); - *p4dp = native_make_p4d(native_pgd_val(pgd)); + WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd))); } static inline void native_p4d_clear(p4d_t *p4d) @@ -153,7 +154,7 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pti_set_user_pgtbl(pgdp, pgd); + WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd)); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b64acb08a62b..106b7d0e2dae 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -124,7 +124,7 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 7f2dbd91fc74..90cb2f36c042 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -88,7 +88,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e); + return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d53c54b842da..617805981cce 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -155,7 +155,8 @@ enum cpuid_regs_idx { #define X86_VENDOR_CENTAUR 5 #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 -#define X86_VENDOR_NUM 9 +#define X86_VENDOR_HYGON 9 +#define X86_VENDOR_NUM 10 #define X86_VENDOR_UNKNOWN 0xff @@ -315,7 +316,13 @@ struct x86_hw_tss { */ u64 sp1; + /* + * Since Linux does not use ring 2, the 'sp2' slot is unused by + * hardware. entry_SYSCALL_64 uses it as scratch space to stash + * the user RSP value. + */ u64 sp2; + u64 reserved2; u64 ist[7]; u32 reserved3; @@ -578,7 +585,7 @@ static inline bool on_thread_stack(void) current_stack_pointer) < THREAD_SIZE; } -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define __cpuid native_cpuid @@ -589,7 +596,7 @@ static inline void load_sp0(unsigned long sp0) } #define set_iopl_mask native_set_iopl_mask -#endif /* CONFIG_PARAVIRT */ +#endif /* CONFIG_PARAVIRT_XXL */ /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6de1fd3d0097..143c99499531 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -37,8 +37,10 @@ struct pt_regs { unsigned short __esh; unsigned short fs; unsigned short __fsh; + /* On interrupt, gs and __gsh store the vector number. */ unsigned short gs; unsigned short __gsh; + /* On interrupt, this is the error code. */ unsigned long orig_ax; unsigned long ip; unsigned short cs; @@ -144,7 +146,7 @@ static inline int v8086_mode(struct pt_regs *regs) static inline bool user_64bit_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_64 -#ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PARAVIRT_XXL /* * On non-paravirt systems, this is the only long mode CPL 3 * selector. We do not allow long mode selectors in the LDT. @@ -237,23 +239,51 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs, } /** + * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns the address of the @n th entry of the + * kernel stack which is specified by @regs. If the @n th entry is NOT in + * the kernel stack, this returns NULL. + */ +static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return addr; + else + return NULL; +} + +/* To avoid include hell, we can't include uaccess.h */ +extern long probe_kernel_read(void *dst, const void *src, size_t size); + +/** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which - * is specified by @regs. If the @n th entry is NOT in the kernel stack, + * is specified by @regs. If the @n th entry is NOT in the kernel stack * this returns 0. */ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { - unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); - addr += n; - if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; - else - return 0; + unsigned long *addr; + unsigned long val; + long ret; + + addr = regs_get_kernel_stack_nth_addr(regs, n); + if (addr) { + ret = probe_kernel_read(&val, addr, sizeof(val)); + if (!ret) + return val; + } + return 0; } #define arch_has_single_step() (1) @@ -263,7 +293,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif -#define ARCH_HAS_USER_SINGLE_STEP_INFO +#define ARCH_HAS_USER_SINGLE_STEP_REPORT /* * When hitting ptrace_stop(), we cannot return using SYSRET because diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 3e70bed8a978..87623c6b13db 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -6,9 +6,24 @@ #include <asm/cpufeature.h> #include <asm-generic/qspinlock_types.h> #include <asm/paravirt.h> +#include <asm/rmwcc.h> #define _Q_PENDING_LOOPS (1 << 9) +#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + u32 val = 0; + + if (GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c, + "I", _Q_PENDING_OFFSET)) + val |= _Q_PENDING_VAL; + + val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK; + + return val; +} + #ifdef CONFIG_PARAVIRT_SPINLOCKS extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index 19b90521954c..a8b5e1e13319 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -4,6 +4,41 @@ * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from * PaX/grsecurity. */ + +#ifdef __ASSEMBLY__ + +#include <asm/asm.h> +#include <asm/bug.h> + +.macro REFCOUNT_EXCEPTION counter:req + .pushsection .text..refcount +111: lea \counter, %_ASM_CX +112: ud2 + ASM_UNREACHABLE + .popsection +113: _ASM_EXTABLE_REFCOUNT(112b, 113b) +.endm + +/* Trigger refcount exception if refcount result is negative. */ +.macro REFCOUNT_CHECK_LT_ZERO counter:req + js 111f + REFCOUNT_EXCEPTION counter="\counter" +.endm + +/* Trigger refcount exception if refcount result is zero or negative. */ +.macro REFCOUNT_CHECK_LE_ZERO counter:req + jz 111f + REFCOUNT_CHECK_LT_ZERO counter="\counter" +.endm + +/* Trigger refcount exception unconditionally. */ +.macro REFCOUNT_ERROR counter:req + jmp 111f + REFCOUNT_EXCEPTION counter="\counter" +.endm + +#else /* __ASSEMBLY__ */ + #include <linux/refcount.h> #include <asm/bug.h> @@ -15,34 +50,11 @@ * central refcount exception. The fixup address for the exception points * back to the regular execution flow in .text. */ -#define _REFCOUNT_EXCEPTION \ - ".pushsection .text..refcount\n" \ - "111:\tlea %[counter], %%" _ASM_CX "\n" \ - "112:\t" ASM_UD2 "\n" \ - ASM_UNREACHABLE \ - ".popsection\n" \ - "113:\n" \ - _ASM_EXTABLE_REFCOUNT(112b, 113b) - -/* Trigger refcount exception if refcount result is negative. */ -#define REFCOUNT_CHECK_LT_ZERO \ - "js 111f\n\t" \ - _REFCOUNT_EXCEPTION - -/* Trigger refcount exception if refcount result is zero or negative. */ -#define REFCOUNT_CHECK_LE_ZERO \ - "jz 111f\n\t" \ - REFCOUNT_CHECK_LT_ZERO - -/* Trigger refcount exception unconditionally. */ -#define REFCOUNT_ERROR \ - "jmp 111f\n\t" \ - _REFCOUNT_EXCEPTION static __always_inline void refcount_add(unsigned int i, refcount_t *r) { asm volatile(LOCK_PREFIX "addl %1,%0\n\t" - REFCOUNT_CHECK_LT_ZERO + "REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\"" : [counter] "+m" (r->refs.counter) : "ir" (i) : "cc", "cx"); @@ -51,7 +63,7 @@ static __always_inline void refcount_add(unsigned int i, refcount_t *r) static __always_inline void refcount_inc(refcount_t *r) { asm volatile(LOCK_PREFIX "incl %0\n\t" - REFCOUNT_CHECK_LT_ZERO + "REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\"" : [counter] "+m" (r->refs.counter) : : "cc", "cx"); } @@ -59,7 +71,7 @@ static __always_inline void refcount_inc(refcount_t *r) static __always_inline void refcount_dec(refcount_t *r) { asm volatile(LOCK_PREFIX "decl %0\n\t" - REFCOUNT_CHECK_LE_ZERO + "REFCOUNT_CHECK_LE_ZERO counter=\"%[counter]\"" : [counter] "+m" (r->refs.counter) : : "cc", "cx"); } @@ -67,14 +79,17 @@ static __always_inline void refcount_dec(refcount_t *r) static __always_inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) { - GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, "er", i, "%0", e, "cx"); + + return GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", + "REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"", + r->refs.counter, e, "er", i, "cx"); } static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) { - GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, "%0", e, "cx"); + return GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", + "REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"", + r->refs.counter, e, "cx"); } static __always_inline __must_check @@ -91,7 +106,7 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r) /* Did we try to increment from/to an undesirable state? */ if (unlikely(c < 0 || c == INT_MAX || result < c)) { - asm volatile(REFCOUNT_ERROR + asm volatile("REFCOUNT_ERROR counter=\"%[counter]\"" : : [counter] "m" (r->refs.counter) : "cc", "cx"); break; @@ -107,4 +122,6 @@ static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) return refcount_add_not_zero(1, r); } +#endif /* __ASSEMBLY__ */ + #endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 4914a3e7c803..46ac84b506f5 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -2,56 +2,69 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +/* This counts to 12. Any more, it will return 13th argument. */ +#define __RMWcc_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n +#define RMWcc_ARGS(X...) __RMWcc_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +#define __RMWcc_CONCAT(a, b) a ## b +#define RMWcc_CONCAT(a, b) __RMWcc_CONCAT(a, b) + #define __CLOBBERS_MEM(clb...) "memory", ## clb #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ -do { \ +#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ +({ \ + bool c = false; \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : [counter] "m" (var), ## __VA_ARGS__ \ + : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ - return 0; \ -cc_label: \ - return 1; \ -} while (0) - -#define __BINARY_RMWcc_ARG " %1, " - + if (0) { \ +cc_label: c = true; \ + } \ + c; \ +}) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ -do { \ +#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ +({ \ bool c; \ asm volatile (fullop CC_SET(cc) \ - : [counter] "+m" (var), CC_OUT(cc) (c) \ + : [var] "+m" (_var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ - return c; \ -} while (0) - -#define __BINARY_RMWcc_ARG " %2, " + c; \ +}) #endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ +#define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) -#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc, clobbers...)\ - __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ - __CLOBBERS_MEM(clobbers)) +#define GEN_UNARY_RMWcc_3(op, var, cc) \ + GEN_UNARY_RMWcc_4(op, var, cc, "%[var]") -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ - __CLOBBERS_MEM(), vcon (val)) +#define GEN_UNARY_RMWcc(X...) RMWcc_CONCAT(GEN_UNARY_RMWcc_, RMWcc_ARGS(X))(X) + +#define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \ + __GEN_RMWcc(op " %[val], " arg0, var, cc, \ + __CLOBBERS_MEM(), [val] vcon (_val)) + +#define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \ + GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]") + +#define GEN_BINARY_RMWcc(X...) RMWcc_CONCAT(GEN_BINARY_RMWcc_, RMWcc_ARGS(X))(X) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) \ + __GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \ + __CLOBBERS_MEM(clobbers)) -#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc, \ - clobbers...) \ - __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ - __CLOBBERS_MEM(clobbers), vcon (val)) +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, cc, vcon, _val, clobbers...)\ + __GEN_RMWcc(op " %[val], %[var]\n\t" suffix, var, cc, \ + __CLOBBERS_MEM(clobbers), [val] vcon (_val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 4a911a382ade..8ea1cfdbeabc 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -11,7 +11,6 @@ extern char __end_rodata_aligned[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; -extern char __entry_trampoline_start[], __entry_trampoline_end[]; #endif #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index e293c122d0d5..ac3892920419 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -186,8 +186,7 @@ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 -/* Abused to load per CPU data from limit */ -#define GDT_ENTRY_PER_CPU 15 +#define GDT_ENTRY_CPUNODE 15 /* * Number of entries in the GDT table: @@ -207,11 +206,11 @@ #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER32_DS __USER_DS #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) +#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #endif -#ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PARAVIRT_XXL # define get_kernel_rpl() 0 #endif @@ -225,6 +224,47 @@ #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) +#ifdef CONFIG_X86_64 + +/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ +#define VDSO_CPUNODE_BITS 12 +#define VDSO_CPUNODE_MASK 0xfff + +#ifndef __ASSEMBLY__ + +/* Helper functions to store/load CPU and node numbers */ + +static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) +{ + return (node << VDSO_CPUNODE_BITS) | cpu; +} + +static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) +{ + unsigned int p; + + /* + * Load CPU and node number from the GDT. LSL is faster than RDTSCP + * and works on all CPUs. This is volatile so that it orders + * correctly with respect to barrier() and to keep GCC from cleverly + * hoisting it out of the calling function. + * + * If RDPID is available, use it. + */ + alternative_io ("lsl %[seg],%[p]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); + + if (cpu) + *cpu = (p & VDSO_CPUNODE_MASK); + if (node) + *node = (p >> VDSO_CPUNODE_BITS); +} + +#endif /* !__ASSEMBLY__ */ +#endif /* CONFIG_X86_64 */ + #ifdef __KERNEL__ /* diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 317fc59b512c..43c029cdc3fe 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -141,7 +141,7 @@ static inline unsigned long __read_cr4(void) return native_read_cr4(); } -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else @@ -208,7 +208,7 @@ static inline void load_gs_index(unsigned selector) #endif -#endif/* CONFIG_PARAVIRT */ +#endif /* CONFIG_PARAVIRT_XXL */ static inline void clflush(volatile void *__p) { diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index d33f92b9fa22..7ad41bfcc16c 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt) #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 -void memcpy_flushcache(void *dst, const void *src, size_t cnt); +void __memcpy_flushcache(void *dst, const void *src, size_t cnt); +static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + if (__builtin_constant_p(cnt)) { + switch (cnt) { + case 4: + asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src)); + return; + case 8: + asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); + return; + case 16: + asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); + asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8))); + return; + } + } + __memcpy_flushcache(dst, src, cnt); +} #endif #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h index ecffe81ff65c..a892494ca5e4 100644 --- a/arch/x86/include/asm/suspend.h +++ b/arch/x86/include/asm/suspend.h @@ -4,3 +4,11 @@ #else # include <asm/suspend_64.h> #endif +extern unsigned long restore_jump_address __visible; +extern unsigned long jump_address_phys; +extern unsigned long restore_cr3 __visible; +extern unsigned long temp_pgt __visible; +extern unsigned long relocated_restore_code __visible; +extern int relocate_restore_code(void); +/* Defined in hibernate_asm_32/64.S */ +extern asmlinkage __visible int restore_image(void); diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 8be6afb58471..fdbd9d7b7bca 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -32,4 +32,8 @@ struct saved_context { unsigned long return_address; } __attribute__((packed)); +/* routines for saving/restoring kernel state */ +extern char core_restore_code[]; +extern char restore_registers[]; + #endif /* _ASM_X86_SUSPEND_32_H */ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index cb0a1f470980..404b8b1d44f5 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,16 +6,23 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) \ -{ \ - if (!tlb->fullmm && !tlb->need_flush_all) \ - flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \ - else \ - flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \ -} +static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> +static inline void tlb_flush(struct mmu_gather *tlb) +{ + unsigned long start = 0UL, end = TLB_FLUSH_ALL; + unsigned int stride_shift = tlb_get_unmap_shift(tlb); + + if (!tlb->fullmm && !tlb->need_flush_all) { + start = tlb->start; + end = tlb->end; + } + + flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); +} + /* * While x86 architecture in general requires an IPI to perform TLB * shootdown, enablement code for several hypervisors overrides diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 58ce5288878e..323a313947e0 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif -static inline bool tlb_defer_switch_to_init_mm(void) -{ - /* - * If we have PCID, then switching to init_mm is reasonably - * fast. If we don't have PCID, then switching to init_mm is - * quite slow, so we try to defer it in the hopes that we can - * avoid it entirely. The latter approach runs the risk of - * receiving otherwise unnecessary IPIs. - * - * This choice is just a heuristic. The tlb code can handle this - * function returning true or false regardless of whether we have - * PCID. - */ - return !static_cpu_has(X86_FEATURE_PCID); -} - struct tlb_context { u64 ctx_id; u64 tlb_gen; @@ -547,23 +531,30 @@ struct flush_tlb_info { unsigned long start; unsigned long end; u64 new_tlb_gen; + unsigned int stride_shift; + bool freed_tables; }; #define local_flush_tlb() __flush_tlb() -#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) +#define flush_tlb_mm(mm) \ + flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) -#define flush_tlb_range(vma, start, end) \ - flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) +#define flush_tlb_range(vma, start, end) \ + flush_tlb_mm_range((vma)->vm_mm, start, end, \ + ((vma)->vm_flags & VM_HUGETLB) \ + ? huge_page_shift(hstate_vma(vma)) \ + : PAGE_SHIFT, false) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long vmflag); + unsigned long end, unsigned int stride_shift, + bool freed_tables); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) { - flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h index 7bd92db09e8d..54133017267c 100644 --- a/arch/x86/include/asm/trace/mpx.h +++ b/arch/x86/include/asm/trace/mpx.h @@ -11,12 +11,12 @@ TRACE_EVENT(mpx_bounds_register_exception, - TP_PROTO(void *addr_referenced, + TP_PROTO(void __user *addr_referenced, const struct mpx_bndreg *bndreg), TP_ARGS(addr_referenced, bndreg), TP_STRUCT__entry( - __field(void *, addr_referenced) + __field(void __user *, addr_referenced) __field(u64, lower_bound) __field(u64, upper_bound) ), diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index aae77eb8491c..b5e58cc0c5e7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -198,8 +198,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) "4: movl %3,%0\n" \ " jmp 3b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 4b) \ - _ASM_EXTABLE(2b, 4b) \ + _ASM_EXTABLE_UA(1b, 4b) \ + _ASM_EXTABLE_UA(2b, 4b) \ : "=r" (err) \ : "A" (x), "r" (addr), "i" (errret), "0" (err)) @@ -340,8 +340,8 @@ do { \ " xorl %%edx,%%edx\n" \ " jmp 3b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 4b) \ - _ASM_EXTABLE(2b, 4b) \ + _ASM_EXTABLE_UA(1b, 4b) \ + _ASM_EXTABLE_UA(2b, 4b) \ : "=r" (retval), "=&A"(x) \ : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ @@ -386,7 +386,7 @@ do { \ " xor"itype" %"rtype"1,%"rtype"1\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) @@ -398,7 +398,7 @@ do { \ "3: mov %3,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) @@ -474,7 +474,7 @@ struct __large_struct { unsigned long buf[100]; }; "3: mov %3,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "=r"(err) \ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) @@ -602,7 +602,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "q" (__new), "1" (__old) \ : "memory" \ @@ -618,7 +618,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "r" (__new), "1" (__old) \ : "memory" \ @@ -634,7 +634,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "r" (__new), "1" (__old) \ : "memory" \ @@ -653,7 +653,7 @@ extern void __cmpxchg_wrong_size(void) "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ "\t.previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UA(1b, 3b) \ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ : "i" (-EFAULT), "r" (__new), "1" (__old) \ : "memory" \ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 51c4eee00732..dc4ed8bc2382 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -24,6 +24,7 @@ # include <asm/unistd_64.h> # include <asm/unistd_64_x32.h> # define __ARCH_WANT_COMPAT_SYS_TIME +# define __ARCH_WANT_SYS_UTIME32 # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 @@ -31,13 +32,13 @@ # endif +# define __ARCH_WANT_NEW_STAT # define __ARCH_WANT_OLD_READDIR # define __ARCH_WANT_OLD_STAT # define __ARCH_WANT_SYS_ALARM # define __ARCH_WANT_SYS_FADVISE64 # define __ARCH_WANT_SYS_GETHOSTNAME # define __ARCH_WANT_SYS_GETPGRP -# define __ARCH_WANT_SYS_LLSEEK # define __ARCH_WANT_SYS_NICE # define __ARCH_WANT_SYS_OLDUMOUNT # define __ARCH_WANT_SYS_OLD_GETRLIMIT diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index a80c0673798f..e60c45fd3679 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -10,8 +10,13 @@ struct cpumask; struct mm_struct; #ifdef CONFIG_X86_UV +#include <linux/efi.h> extern enum uv_system_type get_uv_system_type(void); +static inline bool is_early_uv_system(void) +{ + return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab); +} extern int is_uv_system(void); extern int is_uv_hubless(void); extern void uv_cpu_init(void); @@ -23,6 +28,7 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, #else /* X86_UV */ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } +static inline bool is_early_uv_system(void) { return 0; } static inline int is_uv_system(void) { return 0; } static inline int is_uv_hubless(void) { return 0; } static inline void uv_cpu_init(void) { } diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 53748541c487..913a133f8e6f 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -5,33 +5,46 @@ #include <linux/compiler.h> #include <linux/clocksource.h> +#include <uapi/linux/time.h> + #ifdef BUILD_VDSO32_64 typedef u64 gtod_long_t; #else typedef unsigned long gtod_long_t; #endif + +/* + * There is one of these objects in the vvar page for each + * vDSO-accelerated clockid. For high-resolution clocks, this encodes + * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse + * clocks, this encodes the actual time. + * + * To confuse the reader, for high-resolution clocks, nsec is left-shifted + * by vsyscall_gtod_data.shift. + */ +struct vgtod_ts { + u64 sec; + u64 nsec; +}; + +#define VGTOD_BASES (CLOCK_TAI + 1) +#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI)) +#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE)) + /* * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time * so be carefull by modifying this structure. */ struct vsyscall_gtod_data { - unsigned seq; - - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - - /* open coded 'struct timespec' */ - u64 wall_time_snsec; - gtod_long_t wall_time_sec; - gtod_long_t monotonic_time_sec; - u64 monotonic_time_snsec; - gtod_long_t wall_time_coarse_sec; - gtod_long_t wall_time_coarse_nsec; - gtod_long_t monotonic_time_coarse_sec; - gtod_long_t monotonic_time_coarse_nsec; + unsigned int seq; + + int vclock_mode; + u64 cycle_last; + u64 mask; + u32 mult; + u32 shift; + + struct vgtod_ts basetime[VGTOD_BASES]; int tz_minuteswest; int tz_dsttime; @@ -44,9 +57,9 @@ static inline bool vclock_was_used(int vclock) return READ_ONCE(vclocks_used) & (1 << vclock); } -static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) +static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s) { - unsigned ret; + unsigned int ret; repeat: ret = READ_ONCE(s->seq); @@ -59,7 +72,7 @@ repeat: } static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, - unsigned start) + unsigned int start) { smp_rmb(); return unlikely(s->seq != start); @@ -77,30 +90,4 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s) ++s->seq; } -#ifdef CONFIG_X86_64 - -#define VGETCPU_CPU_MASK 0xfff - -static inline unsigned int __getcpu(void) -{ - unsigned int p; - - /* - * Load per CPU data from GDT. LSL is faster than RDTSCP and - * works on all CPUs. This is volatile so that it orders - * correctly wrt barrier() and to keep gcc from cleverly - * hoisting it out of the calling function. - * - * If RDPID is available, use it. - */ - alternative_io ("lsl %[seg],%[p]", - ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ - X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); - - return p; -} - -#endif /* CONFIG_X86_64 */ - #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 0116b2ee9e64..e05e0d309244 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -83,9 +83,10 @@ static inline void cpu_emergency_vmxoff(void) */ static inline int cpu_has_svm(const char **msg) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { if (msg) - *msg = "not amd"; + *msg = "not amd or hygon"; return 0; } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b85a7c54c6a1..0f842104862c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -303,4 +303,6 @@ extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); extern bool x86_pnpbios_disabled(void); +void x86_verify_bootdata_version(void); + #endif diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index d383140e1dc8..068d9b067c83 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_XEN_EVENTS_H #define _ASM_X86_XEN_EVENTS_H +#include <xen/xen.h> + enum ipi_vector { XEN_RESCHEDULE_VECTOR, XEN_CALL_FUNCTION_VECTOR, diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index a06cbf019744..22f89d040ddd 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -16,6 +16,9 @@ #define RAMDISK_PROMPT_FLAG 0x8000 #define RAMDISK_LOAD_FLAG 0x4000 +/* version flags */ +#define VERSION_WRITTEN 0x8000 + /* loadflags */ #define LOADED_HIGH (1<<0) #define KASLR_FLAG (1<<1) @@ -86,6 +89,7 @@ struct setup_header { __u64 pref_address; __u32 init_size; __u32 handover_offset; + __u64 acpi_rsdp_addr; } __attribute__((packed)); struct sys_desc_table { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 86299efa804a..fd23d5778ea1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -377,6 +377,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/include/uapi/asm/siginfo.h b/arch/x86/include/uapi/asm/siginfo.h index b3d157957177..6642d8be40c4 100644 --- a/arch/x86/include/uapi/asm/siginfo.h +++ b/arch/x86/include/uapi/asm/siginfo.h @@ -7,8 +7,6 @@ typedef long long __kernel_si_clock_t __attribute__((aligned(4))); # define __ARCH_SI_CLOCK_T __kernel_si_clock_t # define __ARCH_SI_ATTRIBUTES __attribute__((aligned(8))) -# else /* x86-64 */ -# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) # endif #endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3b20607d581b..e8fea7ffa306 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -48,6 +48,7 @@ #include <asm/mpspec.h> #include <asm/smp.h> #include <asm/i8259.h> +#include <asm/setup.h> #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; @@ -1771,3 +1772,8 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) e820__range_add(addr, size, E820_TYPE_ACPI); e820__update_table_print(); } + +u64 x86_default_get_root_pointer(void) +{ + return boot_params.hdr.acpi_rsdp_addr; +} diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b9d5e7c9ef43..ebeac487a20c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -222,6 +222,10 @@ void __init arch_init_ideal_nops(void) } break; + case X86_VENDOR_HYGON: + ideal_nops = p6_nops; + return; + case X86_VENDOR_AMD: if (boot_cpu_data.x86 > 0xf) { ideal_nops = p6_nops; @@ -594,7 +598,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insnbuf, p->instr, p->len); - used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, + used = pv_ops.init.patch(p->instrtype, insnbuf, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index f299d8a479bb..3f9d1b4019bb 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -482,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, { void *vaddr; - vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); + vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs); if (!vaddr || !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) return vaddr; @@ -494,7 +494,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, goto out_free; return vaddr; out_free: - dma_direct_free(dev, size, vaddr, *dma_addr, attrs); + dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -504,7 +504,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); - dma_direct_free(dev, size, vaddr, dma_addr, attrs); + dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs); } static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b481b95bd8f6..a6eca647bc76 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -61,6 +61,21 @@ static const struct pci_device_id amd_nb_link_ids[] = { {} }; +static const struct pci_device_id hygon_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) }, + {} +}; + +const struct pci_device_id hygon_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + {} +}; + +static const struct pci_device_id hygon_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + {} +}; + const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { 0x00, 0x18, 0x20 }, { 0xff, 0x00, 0x20 }, @@ -194,15 +209,24 @@ EXPORT_SYMBOL_GPL(amd_df_indirect_read); int amd_cache_northbridges(void) { - u16 i = 0; - struct amd_northbridge *nb; + const struct pci_device_id *misc_ids = amd_nb_misc_ids; + const struct pci_device_id *link_ids = amd_nb_link_ids; + const struct pci_device_id *root_ids = amd_root_ids; struct pci_dev *root, *misc, *link; + struct amd_northbridge *nb; + u16 i = 0; if (amd_northbridges.num) return 0; + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + root_ids = hygon_root_ids; + misc_ids = hygon_nb_misc_ids; + link_ids = hygon_nb_link_ids; + } + misc = NULL; - while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + while ((misc = next_northbridge(misc, misc_ids)) != NULL) i++; if (!i) @@ -218,11 +242,11 @@ int amd_cache_northbridges(void) link = misc = root = NULL; for (i = 0; i != amd_northbridges.num; i++) { node_to_amd_nb(i)->root = root = - next_northbridge(root, amd_root_ids); + next_northbridge(root, root_ids); node_to_amd_nb(i)->misc = misc = - next_northbridge(misc, amd_nb_misc_ids); + next_northbridge(misc, misc_ids); node_to_amd_nb(i)->link = link = - next_northbridge(link, amd_nb_link_ids); + next_northbridge(link, link_ids); } if (amd_gart_present()) @@ -261,11 +285,19 @@ EXPORT_SYMBOL_GPL(amd_cache_northbridges); */ bool __init early_is_amd_nb(u32 device) { + const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *id; u32 vendor = device & 0xffff; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + misc_ids = hygon_nb_misc_ids; + device >>= 16; - for (id = amd_nb_misc_ids; id->vendor; id++) + for (id = misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return true; return false; @@ -277,7 +309,8 @@ struct resource *amd_get_mmconfig_range(struct resource *res) u64 base, msr; unsigned int segn_busn_bits; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return NULL; /* assume all cpus from fam10h have mmconfig */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 84132eddb5a8..ab731ab09f06 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -224,6 +224,11 @@ static int modern_apic(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xf) return 1; + + /* Hygon systems use modern APIC */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return 1; + return lapic_get_version() >= 0x14; } @@ -1912,6 +1917,8 @@ static int __init detect_init_APIC(void) (boot_cpu_data.x86 >= 15)) break; goto no_apic; + case X86_VENDOR_HYGON: + break; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 02e8acb134f8..47ff2976c292 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -185,6 +185,7 @@ void __init default_setup_apic_routing(void) break; } /* If P4 and above fall through */ + case X86_VENDOR_HYGON: case X86_VENDOR_AMD: def_to_bigsmp = 1; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 9f148e3d45b4..652e7ffa9b9d 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -313,14 +313,13 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) struct apic_chip_data *apicd = apic_chip_data(irqd); int vector, cpu; - cpumask_and(vector_searchmask, vector_searchmask, affmsk); - cpu = cpumask_first(vector_searchmask); - if (cpu >= nr_cpu_ids) - return -EINVAL; + cpumask_and(vector_searchmask, dest, affmsk); + /* set_affinity might call here for nothing */ if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask)) return 0; - vector = irq_matrix_alloc_managed(vector_matrix, cpu); + vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask, + &cpu); trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; @@ -413,7 +412,7 @@ static int activate_managed(struct irq_data *irqd) if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) { /* Something in the core code broke! Survive gracefully */ pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq); - return EINVAL; + return -EINVAL; } ret = assign_managed_vector(irqd, vector_searchmask); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index ec00d1ff5098..f7151cd03cb0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1640,6 +1640,7 @@ static int do_open(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS static int proc_apm_show(struct seq_file *m, void *v) { unsigned short bx; @@ -1719,6 +1720,7 @@ static int proc_apm_show(struct seq_file *m, void *v) units); return 0; } +#endif static int apm(void *unused) { diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 01de31db300d..72adf6c335dc 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -64,15 +64,12 @@ void common(void) { OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL BLANK(); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); - OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); + OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); + OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); + OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); + OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2); #endif #ifdef CONFIG_XEN @@ -99,13 +96,12 @@ void common(void) { OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); - OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); - /* Offset for sp0 and sp1 into the tss_struct */ + /* Offset for fields in tss_struct */ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 3b9405e7ba2b..ddced33184b5 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -21,10 +21,13 @@ static char syscalls_ia32[] = { int main(void) { #ifdef CONFIG_PARAVIRT - OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); - OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_PARAVIRT_XXL + OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template, + cpu.usergs_sysret64); + OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs); #ifdef CONFIG_DEBUG_ENTRY - OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); + OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); +#endif #endif BLANK(); #endif diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 33399426793e..1979a76bfadd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <linux/sched.h> #include <linux/kthread.h> @@ -31,11 +34,17 @@ static __init int set_corruption_check(char *arg) ssize_t ret; unsigned long val; + if (!arg) { + pr_err("memory_corruption_check config string not provided\n"); + return -EINVAL; + } + ret = kstrtoul(arg, 10, &val); if (ret) return ret; memory_corruption_check = val; + return 0; } early_param("memory_corruption_check", set_corruption_check); @@ -45,6 +54,11 @@ static __init int set_corruption_check_period(char *arg) ssize_t ret; unsigned long val; + if (!arg) { + pr_err("memory_corruption_check_period config string not provided\n"); + return -EINVAL; + } + ret = kstrtoul(arg, 10, &val); if (ret) return ret; @@ -59,6 +73,11 @@ static __init int set_corruption_check_size(char *arg) char *end; unsigned size; + if (!arg) { + pr_err("memory_corruption_check_size config string not provided\n"); + return -EINVAL; + } + size = memparse(arg, &end); if (*end == '\0') @@ -113,7 +132,7 @@ void __init setup_bios_corruption_check(void) } if (num_scan_areas) - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); + pr_info("Scanning %d areas for low memory corruption\n", num_scan_areas); } @@ -132,8 +151,7 @@ void check_for_bios_corruption(void) for (; size; addr++, size -= sizeof(unsigned long)) { if (!*addr) continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); + pr_err("Corrupted low memory at %p (%lx phys) = %08lx\n", addr, __pa(addr), *addr); corruption = 1; *addr = 0; } @@ -157,11 +175,11 @@ static int start_periodic_check_for_corruption(void) if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0) return 0; - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); + pr_info("Scanning for low memory corruption every %d seconds\n", corruption_check_period); /* First time we run the checks right away */ schedule_delayed_work(&bios_check_work, 0); + return 0; } device_initcall(start_periodic_check_for_corruption); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 347137e80bf5..1f5d2291c31e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o +obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 22ab408177b2..eeea634bee0a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -922,7 +922,7 @@ static void init_amd(struct cpuinfo_x86 *c) static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ - if ((c->x86 == 6)) { + if (c->x86 == 6) { /* Duron Rev A0 */ if (c->x86_model == 3 && c->x86_stepping == 0) size = 64; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 40bdaea97fe7..c37e66e493bf 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -35,12 +35,10 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); -/* - * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any - * writes to SPEC_CTRL contain whatever reserved bits have been set. - */ -u64 __ro_after_init x86_spec_ctrl_base; +/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ +u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); +static DEFINE_MUTEX(spec_ctrl_mutex); /* * The vendor and possibly platform specific bits which can be modified in @@ -312,6 +310,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); return SPECTRE_V2_CMD_AUTO; @@ -325,6 +324,46 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static bool stibp_needed(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE) + return false; + + if (!boot_cpu_has(X86_FEATURE_STIBP)) + return false; + + return true; +} + +static void update_stibp_msr(void *info) +{ + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); +} + +void arch_smt_update(void) +{ + u64 mask; + + if (!stibp_needed()) + return; + + mutex_lock(&spec_ctrl_mutex); + mask = x86_spec_ctrl_base; + if (cpu_smt_control == CPU_SMT_ENABLED) + mask |= SPEC_CTRL_STIBP; + else + mask &= ~SPEC_CTRL_STIBP; + + if (mask != x86_spec_ctrl_base) { + pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", + cpu_smt_control == CPU_SMT_ENABLED ? + "Enabling" : "Disabling"); + x86_spec_ctrl_base = mask; + on_each_cpu(update_stibp_msr, NULL, 1); + } + mutex_unlock(&spec_ctrl_mutex); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -371,7 +410,8 @@ static void __init spectre_v2_select_mitigation(void) return; retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { retpoline_amd: if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); @@ -424,6 +464,9 @@ specv2_set_mode: setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } + + /* Enable STIBP if appropriate */ + arch_smt_update(); } #undef pr_fmt @@ -814,6 +857,8 @@ static ssize_t l1tf_show_state(char *buf) static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { + int ret; + if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); @@ -831,10 +876,13 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "Mitigation: __user pointer sanitization\n"); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", spectre_v2_module_string()); + return ret; case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 0c5fcbd998cf..dc1b9342e9c4 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -602,6 +602,10 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) else amd_cpuid4(index, &eax, &ebx, &ecx); amd_init_l3_cache(this_leaf, index); + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -625,7 +629,8 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) union _cpuid4_leaf_eax cache_eax; int i = -1; - if (c->x86_vendor == X86_VENDOR_AMD) + if (c->x86_vendor == X86_VENDOR_AMD || + c->x86_vendor == X86_VENDOR_HYGON) op = 0x8000001d; else op = 4; @@ -678,6 +683,22 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) } } +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +{ + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (!cpuid_edx(0x80000006)) + return; + + /* + * LLC is at the core complex level. + * Core complex ID is ApicId[3] for these processors. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; +} + void init_amd_cacheinfo(struct cpuinfo_x86 *c) { @@ -691,6 +712,11 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c) } } +void init_hygon_cacheinfo(struct cpuinfo_x86 *c) +{ + num_cache_leaves = find_num_cache_leaves(c); +} + void init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ @@ -913,7 +939,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, int index_msb, i; struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86_vendor == X86_VENDOR_AMD || + c->x86_vendor == X86_VENDOR_HYGON) { if (__cache_amd_cpumap_setup(cpu, index, base)) return; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 44c4ef3d989b..660d0b22e962 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -949,11 +949,11 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) } static const __initconst struct x86_cpu_id cpu_no_speculation[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY }, { X86_VENDOR_CENTAUR, 5 }, { X86_VENDOR_INTEL, 5 }, { X86_VENDOR_NSC, 5 }, @@ -963,15 +963,16 @@ static const __initconst struct x86_cpu_id cpu_no_speculation[] = { static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { { X86_VENDOR_AMD }, + { X86_VENDOR_HYGON }, {} }; /* Only list CPUs which speculate but are non susceptible to SSB */ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, @@ -984,14 +985,14 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { /* in addition to cpu_no_speculation */ - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, {} @@ -1076,6 +1077,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + /* cyrix could have cpuid enabled via c_identify()*/ if (have_cpuid_p()) { cpu_detect(c); @@ -1093,7 +1097,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); } else { - identify_cpu_without_cpuid(c); setup_clear_cpu_cap(X86_FEATURE_CPUID); } @@ -1240,10 +1243,10 @@ static void generic_identify(struct cpuinfo_x86 *c) * ESPFIX issue, we can change this. */ #ifdef CONFIG_X86_32 -# ifdef CONFIG_PARAVIRT +# ifdef CONFIG_PARAVIRT_XXL do { extern void native_iret(void); - if (pv_cpu_ops.iret == native_iret) + if (pv_ops.cpu.iret == native_iret) set_cpu_bug(c, X86_BUG_ESPFIX); } while (0); # else @@ -1531,19 +1534,8 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count); /* May not be marked __init: used by software suspend */ void syscall_init(void) { - extern char _entry_trampoline[]; - extern char entry_SYSCALL_64_trampoline[]; - - int cpu = smp_processor_id(); - unsigned long SYSCALL64_entry_trampoline = - (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + - (entry_SYSCALL_64_trampoline - _entry_trampoline); - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - if (static_cpu_has(X86_FEATURE_PTI)) - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); - else - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@ -1554,7 +1546,8 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1669,6 +1662,29 @@ static void wait_for_master_cpu(int cpu) #endif } +#ifdef CONFIG_X86_64 +static void setup_getcpu(int cpu) +{ + unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); + struct desc_struct d = { }; + + if (static_cpu_has(X86_FEATURE_RDTSCP)) + write_rdtscp_aux(cpudata); + + /* Store CPU and node number in limit. */ + d.limit0 = cpudata; + d.limit1 = cpudata >> 16; + + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); +} +#endif + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1706,6 +1722,7 @@ void cpu_init(void) early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif + setup_getcpu(cpu); me = current; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 7b229afa0a37..da5446acc241 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -54,6 +54,7 @@ extern u32 get_scattered_cpuid_leaf(unsigned int level, enum cpuid_regs_idx reg); extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); +extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); extern int detect_extended_topology_early(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 8949b7ae6d92..d12226f60168 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -437,7 +437,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c) /* enable MAPEN */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable cpuid */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* disable MAPEN */ setCx86(CX86_CCR3, ccr3); local_irq_restore(flags); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c new file mode 100644 index 000000000000..cf25405444ab --- /dev/null +++ b/arch/x86/kernel/cpu/hygon.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Hygon Processor Support for Linux + * + * Copyright (C) 2018 Chengdu Haiguang IC Design Co., Ltd. + * + * Author: Pu Wen <puwen@hygon.cn> + */ +#include <linux/io.h> + +#include <asm/cpu.h> +#include <asm/smp.h> +#include <asm/cacheinfo.h> +#include <asm/spec-ctrl.h> +#include <asm/delay.h> +#ifdef CONFIG_X86_64 +# include <asm/set_memory.h> +#endif + +#include "cpu.h" + +/* + * nodes_per_socket: Stores the number of nodes per socket. + * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] + */ +static u32 nodes_per_socket = 1; + +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ +static int nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +static void hygon_get_topology_early(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; +} + +/* + * Fixup core topology information for + * (1) Hygon multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) Hygon processors supporting compute units + */ +static void hygon_get_topology(struct cpuinfo_x86 *c) +{ + u8 node_id; + int cpu = smp_processor_id(); + + /* get information required for multi-node processors */ + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int err; + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + node_id = ecx & 0xff; + + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + + /* + * In case leaf B is available, use it to derive + * topology information. + */ + err = detect_extended_topology(c); + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + + cacheinfo_hygon_init_llc_id(c, cpu, node_id); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + node_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = node_id; + } else + return; + + if (nodes_per_socket > 1) + set_cpu_cap(c, X86_FEATURE_AMD_DCM); +} + +/* + * On Hygon setup the lower bits of the APIC id distinguish the cores. + * Assumes number of cores is a power of two. + */ +static void hygon_detect_cmp(struct cpuinfo_x86 *c) +{ + unsigned int bits; + int cpu = smp_processor_id(); + + bits = c->x86_coreid_bits; + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +} + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node; + unsigned int apicid = c->apicid; + + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); + + /* + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. + */ + if (x86_cpuinit.fixup_cpu_id) + x86_cpuinit.fixup_cpu_id(c, node); + + if (!node_online(node)) { + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs. + * Assume they are all increased by a constant offset, but + * in the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ + int ht_nodeid = c->initial_apicid; + + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); +#endif +} + +static void early_init_hygon_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void bsp_init_hygon(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + + pr_debug("tseg: %010llx\n", tseg); + if (pfn_range_is_mapped(pfn, pfn + 1)) + set_memory_4k((unsigned long)__va(tseg), 1); + } +#endif + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); + } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes_per_socket = ((value >> 3) & 7) + 1; + } + + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; + } + } +} + +static void early_init_hygon(struct cpuinfo_x86 *c) +{ + u32 dummy; + + early_init_hygon_mc(c); + + set_cpu_cap(c, X86_FEATURE_K8); + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* + * ApicID can always be treated as an 8-bit value for Hygon APIC So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally. + */ + if (boot_cpu_has(X86_FEATURE_APIC)) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); +#endif + + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + + hygon_get_topology_early(c); +} + +static void init_hygon(struct cpuinfo_x86 *c) +{ + early_init_hygon(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); + + set_cpu_cap(c, X86_FEATURE_ZEN); + set_cpu_cap(c, X86_FEATURE_CPB); + + cpu_detect_cache_sizes(c); + + hygon_detect_cmp(c); + hygon_get_topology(c); + srat_detect_node(c); + + init_hygon_cacheinfo(c); + + if (cpu_has(c, X86_FEATURE_XMM2)) { + unsigned long long val; + int ret; + + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. + */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } + } + + /* + * Hygon processors have APIC timer running in deep C states. + */ + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); +} + +static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; +} + +static const struct cpu_dev hygon_cpu_dev = { + .c_vendor = "Hygon", + .c_ident = { "HygonGenuine" }, + .c_early_init = early_init_hygon, + .c_detect_tlb = cpu_detect_tlb_hygon, + .c_bsp_init = bsp_init_hygon, + .c_init = init_hygon, + .c_x86_vendor = X86_VENDOR_HYGON, +}; + +cpu_dev_register(hygon_cpu_dev); diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index abb71ac70443..44272b7107ad 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -485,9 +485,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), - sizeof(unsigned long), - GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); @@ -496,7 +494,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); if (!d->mbm_total) { - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); return -ENOMEM; } } @@ -504,7 +502,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); if (!d->mbm_local) { - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); return -ENOMEM; } @@ -610,9 +608,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cancel_delayed_work(&d->cqm_limbo); } + /* + * rdt_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. + */ + if (d->plr) + d->plr->d = NULL; + kfree(d->ctrl_val); kfree(d->mbps_val); - kfree(d->rmid_busy_llc); + bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); kfree(d); diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4e588f36228f..3736f6dc9545 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e) e <= QOS_L3_MBM_LOCAL_EVENT_ID); } +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource @@ -423,16 +428,19 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (void *data, struct rdt_resource *r, - struct rdt_domain *d); + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); struct list_head evt_list; int num_rmid; unsigned int mon_scale; unsigned long fflags; }; -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -521,14 +529,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - u32 _cbm, int closid, bool exclusive); + unsigned long cbm, int closid, bool exclusive); unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, - u32 cbm); + unsigned long cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); int rdtgroup_tasks_assigned(struct rdtgroup *r); int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm); bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); int rdt_pseudo_lock_init(void); void rdt_pseudo_lock_release(void); @@ -536,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); +int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index af358ca05160..27937458c231 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - unsigned long data; - char *buf = _buf; + unsigned long bw_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if (!bw_validate(buf, &data, r)) + if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = data; + d->new_ctrl = bw_val; d->have_new_ctrl = true; return 0; @@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } -struct rdt_cbm_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - struct rdt_cbm_parse_data *data = _data; struct rdtgroup *rdtgrp = data->rdtgrp; u32 cbm_val; @@ -195,11 +190,17 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) static int parse_line(char *line, struct rdt_resource *r, struct rdtgroup *rdtgrp) { - struct rdt_cbm_parse_data data; + struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + r->rid == RDT_RESOURCE_MBA) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; @@ -403,8 +404,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, for_each_alloc_enabled_rdt_resource(r) seq_printf(s, "%s:uninitialized\n", r->name); } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name, - rdtgrp->plr->d->id, rdtgrp->plr->cbm); + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->r->name, + rdtgrp->plr->d->id, + rdtgrp->plr->cbm); + } } else { closid = rdtgrp->closid; for_each_alloc_enabled_rdt_resource(r) { diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 40f3903ae5d9..815b4e92522c 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/kthread.h> #include <linux/mman.h> +#include <linux/perf_event.h> #include <linux/pm_qos.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -26,6 +27,7 @@ #include <asm/intel_rdt_sched.h> #include <asm/perf_event.h> +#include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "intel_rdt.h" #define CREATE_TRACE_POINTS @@ -91,7 +93,7 @@ static u64 get_prefetch_disable_bits(void) */ return 0xF; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GEMINI_LAKE: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -106,16 +108,6 @@ static u64 get_prefetch_disable_bits(void) return 0; } -/* - * Helper to write 64bit value to MSR without tracing. Used when - * use of the cache should be restricted and use of registers used - * for local variables avoided. - */ -static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val) -{ - __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); -} - /** * pseudo_lock_minor_get - Obtain available minor number * @minor: Pointer to where new minor number will be stored @@ -797,25 +789,27 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) /** * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked * @d: RDT domain - * @_cbm: CBM to test + * @cbm: CBM to test * - * @d represents a cache instance and @_cbm a capacity bitmask that is - * considered for it. Determine if @_cbm overlaps with any existing + * @d represents a cache instance and @cbm a capacity bitmask that is + * considered for it. Determine if @cbm overlaps with any existing * pseudo-locked region on @d. * - * Return: true if @_cbm overlaps with pseudo-locked region on @d, false + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: true if @cbm overlaps with pseudo-locked region on @d, false * otherwise. */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm) +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) { - unsigned long *cbm = (unsigned long *)&_cbm; - unsigned long *cbm_b; unsigned int cbm_len; + unsigned long cbm_b; if (d->plr) { cbm_len = d->plr->r->cache.cbm_len; - cbm_b = (unsigned long *)&d->plr->cbm; - if (bitmap_intersects(cbm, cbm_b, cbm_len)) + cbm_b = d->plr->cbm; + if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) return true; } return false; @@ -886,31 +880,14 @@ static int measure_cycles_lat_fn(void *_plr) struct pseudo_lock_region *plr = _plr; unsigned long i; u64 start, end; -#ifdef CONFIG_KASAN - /* - * The registers used for local register variables are also used - * when KASAN is active. When KASAN is active we use a regular - * variable to ensure we always use a valid pointer to access memory. - * The cost is that accessing this pointer, which could be in - * cache, will be included in the measurement of memory read latency. - */ void *mem_r; -#else -#ifdef CONFIG_X86_64 - register void *mem_r asm("rbx"); -#else - register void *mem_r asm("ebx"); -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_KASAN */ local_irq_disable(); /* - * The wrmsr call may be reordered with the assignment below it. - * Call wrmsr as directly as possible to avoid tracing clobbering - * local register variable used for memory pointer. + * Disable hardware prefetchers. */ - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); - mem_r = plr->kmem; + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed * instructions into the L1 instruction cache. @@ -932,157 +909,240 @@ static int measure_cycles_lat_fn(void *_plr) return 0; } -static int measure_cycles_perf_fn(void *_plr) +/* + * Create a perf_event_attr for the hit and miss perf events that will + * be used during the performance measurement. A perf_event maintains + * a pointer to its perf_event_attr so a unique attribute structure is + * created for each perf_event. + * + * The actual configuration of the event is set right before use in order + * to use the X86_CONFIG macro. + */ +static struct perf_event_attr perf_miss_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +static struct perf_event_attr perf_hit_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +struct residency_counts { + u64 miss_before, hits_before; + u64 miss_after, hits_after; +}; + +static int measure_residency_fn(struct perf_event_attr *miss_attr, + struct perf_event_attr *hit_attr, + struct pseudo_lock_region *plr, + struct residency_counts *counts) { - unsigned long long l3_hits = 0, l3_miss = 0; - u64 l3_hit_bits = 0, l3_miss_bits = 0; - struct pseudo_lock_region *plr = _plr; - unsigned long long l2_hits, l2_miss; - u64 l2_hit_bits, l2_miss_bits; - unsigned long i; -#ifdef CONFIG_KASAN - /* - * The registers used for local register variables are also used - * when KASAN is active. When KASAN is active we use regular variables - * at the cost of including cache access latency to these variables - * in the measurements. - */ + u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; + struct perf_event *miss_event, *hit_event; + int hit_pmcnum, miss_pmcnum; unsigned int line_size; unsigned int size; + unsigned long i; void *mem_r; -#else - register unsigned int line_size asm("esi"); - register unsigned int size asm("edi"); -#ifdef CONFIG_X86_64 - register void *mem_r asm("rbx"); -#else - register void *mem_r asm("ebx"); -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_KASAN */ + u64 tmp; + + miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(miss_event)) + goto out; + + hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(hit_event)) + goto out_miss; + + local_irq_disable(); + /* + * Check any possible error state of events used by performing + * one local read. + */ + if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + + /* + * Disable hardware prefetchers. + */ + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + + /* Initialize rest of local variables */ + /* + * Performance event has been validated right before this with + * interrupts disabled - it is thus safe to read the counter index. + */ + miss_pmcnum = x86_perf_rdpmc_index(miss_event); + hit_pmcnum = x86_perf_rdpmc_index(hit_event); + line_size = READ_ONCE(plr->line_size); + mem_r = READ_ONCE(plr->kmem); + size = READ_ONCE(plr->size); + + /* + * Read counter variables twice - first to load the instructions + * used in L1 cache, second to capture accurate value that does not + * include cache misses incurred because of instruction loads. + */ + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * From SDM: Performing back-to-back fast reads are not guaranteed + * to be monotonic. + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_after); + rdpmcl(miss_pmcnum, miss_after); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + /* Re-enable hardware prefetchers */ + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); +out_hit: + perf_event_release_kernel(hit_event); +out_miss: + perf_event_release_kernel(miss_event); +out: + /* + * All counts will be zero on failure. + */ + counts->miss_before = miss_before; + counts->hits_before = hits_before; + counts->miss_after = miss_after; + counts->hits_after = hits_after; + return 0; +} + +static int measure_l2_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; /* * Non-architectural event for the Goldmont Microarchitecture * from Intel x86 Architecture Software Developer Manual (SDM): * MEM_LOAD_UOPS_RETIRED D1H (event number) * Umask values: - * L1_HIT 01H * L2_HIT 02H - * L1_MISS 08H * L2_MISS 10H - * - * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event - * has two "no fix" errata associated with it: BDM35 and BDM100. On - * this platform we use the following events instead: - * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/) - * REFERENCES FFH - * MISS 3FH - * LONGEST_LAT_CACHE 2EH (Documented in SDM) - * REFERENCE 4FH - * MISS 41H */ - - /* - * Start by setting flags for IA32_PERFEVTSELx: - * OS (Operating system mode) 0x2 - * INT (APIC interrupt enable) 0x10 - * EN (Enable counter) 0x40 - * - * Then add the Umask value and event number to select performance - * event. - */ - switch (boot_cpu_data.x86_model) { case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GEMINI_LAKE: - l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1; - l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1; - break; - case INTEL_FAM6_BROADWELL_X: - /* On BDW the l2_hit_bits count references, not hits */ - l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24; - l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24; - /* On BDW the l3_hit_bits count references, not hits */ - l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e; - l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e; + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + perf_miss_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x10); + perf_hit_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x2); break; default: goto out; } - local_irq_disable(); + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); /* - * Call wrmsr direcly to avoid the local register variables from - * being overwritten due to reordering of their assignment with - * the wrmsr calls. + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. */ - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); - /* Disable events and reset counters */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0); - } - /* Set and enable the L2 counters */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, - l3_hit_bits); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, - l3_miss_bits); - } - mem_r = plr->kmem; - size = plr->size; - line_size = plr->line_size; - for (i = 0; i < size; i += line_size) { - asm volatile("mov (%0,%1,1), %%eax\n\t" - : - : "r" (mem_r), "r" (i) - : "%eax", "memory"); - } + trace_pseudo_lock_l2(counts.hits_after - counts.hits_before, + counts.miss_after - counts.miss_before); +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +static int measure_l3_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; + /* - * Call wrmsr directly (no tracing) to not influence - * the cache access counters as they are disabled. + * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event + * has two "no fix" errata associated with it: BDM35 and BDM100. On + * this platform the following events are used instead: + * LONGEST_LAT_CACHE 2EH (Documented in SDM) + * REFERENCE 4FH + * MISS 41H */ - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, - l2_hit_bits & ~(0x40ULL << 16)); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, - l2_miss_bits & ~(0x40ULL << 16)); - if (l3_hit_bits > 0) { - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, - l3_hit_bits & ~(0x40ULL << 16)); - pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, - l3_miss_bits & ~(0x40ULL << 16)); - } - l2_hits = native_read_pmc(0); - l2_miss = native_read_pmc(1); - if (l3_hit_bits > 0) { - l3_hits = native_read_pmc(2); - l3_miss = native_read_pmc(3); + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* On BDW the hit event counts references, not hits */ + perf_hit_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x4f); + perf_miss_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x41); + break; + default: + goto out; } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); - local_irq_enable(); + + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); /* - * On BDW we count references and misses, need to adjust. Sometimes - * the "hits" counter is a bit more than the references, for - * example, x references but x + 1 hits. To not report invalid - * hit values in this case we treat that as misses eaqual to - * references. + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. */ - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) - l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss); - trace_pseudo_lock_l2(l2_hits, l2_miss); - if (l3_hit_bits > 0) { - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) - l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss); - trace_pseudo_lock_l3(l3_hits, l3_miss); + + counts.miss_after -= counts.miss_before; + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + /* + * On BDW references and misses are counted, need to adjust. + * Sometimes the "hits" counter is a bit more than the + * references, for example, x references but x + 1 hits. + * To not report invalid hit values in this case we treat + * that as misses equal to references. + */ + /* First compute the number of cache references measured */ + counts.hits_after -= counts.hits_before; + /* Next convert references to cache hits */ + counts.hits_after -= min(counts.miss_after, counts.hits_after); + } else { + counts.hits_after -= counts.hits_before; } + trace_pseudo_lock_l3(counts.hits_after, counts.miss_after); out: plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -1114,6 +1174,11 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) goto out; } + if (!plr->d) { + ret = -ENODEV; + goto out; + } + plr->thread_done = 0; cpu = cpumask_first(&plr->d->cpu_mask); if (!cpu_online(cpu)) { @@ -1121,13 +1186,20 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) goto out; } + plr->cpu = cpu; + if (sel == 1) thread = kthread_create_on_node(measure_cycles_lat_fn, plr, cpu_to_node(cpu), "pseudo_lock_measure/%u", cpu); else if (sel == 2) - thread = kthread_create_on_node(measure_cycles_perf_fn, plr, + thread = kthread_create_on_node(measure_l2_residency, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 3) + thread = kthread_create_on_node(measure_l3_residency, plr, cpu_to_node(cpu), "pseudo_lock_measure/%u", cpu); @@ -1171,7 +1243,7 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file, buf[buf_size] = '\0'; ret = kstrtoint(buf, 10, &sel); if (ret == 0) { - if (sel != 1) + if (sel != 1 && sel != 2 && sel != 3) return -EINVAL; ret = debugfs_file_get(file->f_path.dentry); if (ret) @@ -1427,6 +1499,11 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) plr = rdtgrp->plr; + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + /* * Task is required to run with affinity to the cpus associated * with the pseudo-locked region. If this is not the case the task diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b799c00bef09..f27b8115ffa2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...) * limited as the number of resources grows. */ static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} static void closid_init(void) { @@ -111,6 +117,7 @@ static void closid_init(void) /* CLOSID 0 is always reserved for the default group */ closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) @@ -261,17 +268,27 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdtgroup *rdtgrp; + struct cpumask *mask; int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->plr->d->cpu_mask)); - else + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + mask = &rdtgrp->plr->d->cpu_mask; + seq_printf(s, is_cpu_list(of) ? + "%*pbl\n" : "%*pb\n", + cpumask_pr_args(mask)); + } + } else { seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + } } else { ret = -ENOENT; } @@ -802,7 +819,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (!closid_allocated(i)) continue; mode = rdtgroup_mode_by_closid(i); @@ -954,7 +971,78 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, } /** - * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * rdt_cdp_peer_get - Retrieve CDP peer if it exists + * @r: RDT resource to which RDT domain @d belongs + * @d: Cache instance for which a CDP peer is requested + * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer) + * Used to return the result. + * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) + * Used to return the result. + * + * RDT resources are managed independently and by extension the RDT domains + * (RDT resource instances) are managed independently also. The Code and + * Data Prioritization (CDP) RDT resources, while managed independently, + * could refer to the same underlying hardware. For example, + * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache. + * + * When provided with an RDT resource @r and an instance of that RDT + * resource @d rdt_cdp_peer_get() will return if there is a peer RDT + * resource and the exact instance that shares the same hardware. + * + * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists. + * If a CDP peer was found, @r_cdp will point to the peer RDT resource + * and @d_cdp will point to the peer RDT domain. + */ +static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, + struct rdt_resource **r_cdp, + struct rdt_domain **d_cdp) +{ + struct rdt_resource *_r_cdp = NULL; + struct rdt_domain *_d_cdp = NULL; + int ret = 0; + + switch (r->rid) { + case RDT_RESOURCE_L3DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + break; + case RDT_RESOURCE_L3CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + break; + case RDT_RESOURCE_L2DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE]; + break; + case RDT_RESOURCE_L2CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA]; + break; + default: + ret = -ENOENT; + goto out; + } + + /* + * When a new CPU comes online and CDP is enabled then the new + * RDT domains (if any) associated with both CDP RDT resources + * are added in the same CPU online routine while the + * rdtgroup_mutex is held. It should thus not happen for one + * RDT domain to exist and be associated with its RDT CDP + * resource but there is no RDT domain associated with the + * peer RDT CDP resource. Hence the WARN. + */ + _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); + if (WARN_ON(!_d_cdp)) { + _r_cdp = NULL; + ret = -EINVAL; + } + +out: + *r_cdp = _r_cdp; + *d_cdp = _d_cdp; + + return ret; +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other * @r: Resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. @@ -968,33 +1056,34 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, * is false then overlaps with any resource group or hardware entities * will be considered. * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * * Return: false if CBM does not overlap, true if it does. */ -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - u32 _cbm, int closid, bool exclusive) +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) { - unsigned long *cbm = (unsigned long *)&_cbm; - unsigned long *ctrl_b; enum rdtgrp_mode mode; + unsigned long ctrl_b; u32 *ctrl; int i; /* Check for any overlap with regions used by hardware directly */ if (!exclusive) { - if (bitmap_intersects(cbm, - (unsigned long *)&r->cache.shareable_bits, - r->cache.cbm_len)) + ctrl_b = r->cache.shareable_bits; + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) return true; } /* Check for overlap with other resource groups */ ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { - ctrl_b = (unsigned long *)ctrl; + for (i = 0; i < closids_supported(); i++, ctrl++) { + ctrl_b = *ctrl; mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) { + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { if (exclusive) { if (mode == RDT_MODE_EXCLUSIVE) return true; @@ -1009,6 +1098,41 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, } /** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test + * for overlap. Overlap test is not limited to the specific resource for + * which the CBM is intended though - when dealing with CDP resources that + * share the underlying hardware the overlap check should be performed on + * the CDP resource sharing the hardware also. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test. + * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + struct rdt_resource *r_cdp; + struct rdt_domain *d_cdp; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive)) + return true; + + if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0) + return false; + + return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive); +} + +/** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive * * An exclusive resource group implies that there should be no sharing of @@ -1024,16 +1148,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; struct rdt_resource *r; + bool has_cache = false; struct rdt_domain *d; for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) + continue; + has_cache = true; list_for_each_entry(d, &r->domains, list) { if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) + rdtgrp->closid, false)) { + rdt_last_cmd_puts("schemata overlaps\n"); return false; + } } } + if (!has_cache) { + rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n"); + return false; + } + return true; } @@ -1085,7 +1220,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, rdtgrp->mode = RDT_MODE_SHAREABLE; } else if (!strcmp(buf, "exclusive")) { if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - rdt_last_cmd_printf("schemata overlaps\n"); ret = -EINVAL; goto out; } @@ -1121,15 +1255,18 @@ out: * computed by first dividing the total cache size by the CBM length to * determine how many bytes each bit in the bitmask represents. The result * is multiplied with the number of bits set in the bitmask. + * + * @cbm is unsigned long, even if only 32 bits are used to make the + * bitmap functions work correctly. */ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_domain *d, u32 cbm) + struct rdt_domain *d, unsigned long cbm) { struct cpu_cacheinfo *ci; unsigned int size = 0; int num_b, i; - num_b = bitmap_weight((unsigned long *)&cbm, r->cache.cbm_len); + num_b = bitmap_weight(&cbm, r->cache.cbm_len); ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); for (i = 0; i < ci->num_leaves; i++) { if (ci->info_list[i].level == r->cache_level) { @@ -1155,8 +1292,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; - bool sep = false; - u32 cbm; + int ret = 0; + bool sep; + u32 ctrl; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -1165,15 +1303,23 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->r, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->r->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + } goto out; } for_each_alloc_enabled_rdt_resource(r) { + sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { if (sep) @@ -1181,8 +1327,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - cbm = d->ctrl_val[rdtgrp->closid]; - size = rdtgroup_cbm_to_size(r, d, cbm); + ctrl = (!is_mba_sc(r) ? + d->ctrl_val[rdtgrp->closid] : + d->mbps_val[rdtgrp->closid]); + if (r->rid == RDT_RESOURCE_MBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); } seq_printf(s, "%d=%u", d->id, size); sep = true; @@ -1193,7 +1344,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); - return 0; + return ret; } /* rdtgroup information files for one cache resource. */ @@ -2327,28 +2478,48 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { + struct rdt_resource *r_cdp = NULL; + struct rdt_domain *d_cdp = NULL; u32 used_b = 0, unused_b = 0; u32 closid = rdtgrp->closid; struct rdt_resource *r; + unsigned long tmp_cbm; enum rdtgrp_mode mode; struct rdt_domain *d; + u32 peer_ctl, *ctrl; int i, ret; - u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; list_for_each_entry(d, &r->domains, list) { + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) break; - used_b |= *ctrl; + /* + * If CDP is active include peer + * domain's usage to ensure there + * is no overlap with an exclusive + * group. + */ + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl; + d->new_ctrl |= *ctrl | peer_ctl; } } if (d->plr && d->plr->cbm > 0) @@ -2361,9 +2532,14 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) * modify the CBM based on system availability. */ cbm_ensure_valid(&d->new_ctrl, r); - if (bitmap_weight((unsigned long *) &d->new_ctrl, - r->cache.cbm_len) < - r->cache.min_cbm_bits) { + /* + * Assign the u32 CBM to an unsigned long to ensure + * that bitmap_weight() does not access out-of-bound + * memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < + r->cache.min_cbm_bits) { rdt_last_cmd_printf("no space on %s:%d\n", r->name, d->id); return -ENOSPC; @@ -2373,6 +2549,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("failed to initialize allocations\n"); @@ -2760,6 +2942,13 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); + + if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA])) + seq_puts(seq, ",mba_MBps"); + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c index 97685a0c3175..27f394ac983f 100644 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -38,9 +38,6 @@ static struct mce_log_buffer mcelog = { static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); -/* User mode helper program triggered by machine check event */ -extern char mce_helper[128]; - static int dev_mce_log(struct notifier_block *nb, unsigned long val, void *data) { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index c805a06e14c3..1fc424c40a31 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -108,6 +108,9 @@ static void setup_inj_struct(struct mce *m) memset(m, 0, sizeof(struct mce)); m->cpuvendor = boot_cpu_data.x86_vendor; + m->time = ktime_get_real_seconds(); + m->cpuid = cpuid_eax(1); + m->microcode = boot_cpu_data.microcode; } /* Update fake mce registers on current CPU. */ @@ -576,6 +579,9 @@ static int inj_bank_set(void *data, u64 val) m->bank = val; do_inject(); + /* Reset injection struct */ + setup_inj_struct(&i_mce); + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index f34d89c01edc..44396d521987 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -336,7 +336,8 @@ int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = void __init mcheck_vendor_init_severity(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) mce_severity = mce_severity_amd; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 953b3ce92dcc..8cb3c02980cf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -270,7 +270,7 @@ static void print_mce(struct mce *m) { __print_mce(m); - if (m->cpuvendor != X86_VENDOR_AMD) + if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } @@ -508,9 +508,9 @@ static int mce_usable_address(struct mce *m) bool mce_is_memory_error(struct mce *m) { - if (m->cpuvendor == X86_VENDOR_AMD) { + if (m->cpuvendor == X86_VENDOR_AMD || + m->cpuvendor == X86_VENDOR_HYGON) { return amd_mce_is_memory_error(m); - } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes @@ -539,6 +539,9 @@ static bool mce_is_correctable(struct mce *m) if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) return false; + if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED) + return false; + if (m->status & MCI_STATUS_UC) return false; @@ -1315,7 +1318,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) local_irq_disable(); ist_end_non_atomic(); } else { - if (!fixup_exception(regs, X86_TRAP_MC)) + if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, NULL); } @@ -1705,7 +1708,7 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) */ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) { - if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); @@ -1746,6 +1749,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_amd_feature_init(c); break; } + + case X86_VENDOR_HYGON: + mce_hygon_feature_init(c); + break; + case X86_VENDOR_CENTAUR: mce_centaur_feature_init(c); break; @@ -1971,12 +1979,14 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide. + * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs + * are socket-wide. * Disabling them for just a single offlined CPU is bad, since it will * inhibit reporting for all shared resources on the socket like the * last level cache (LLC), the integrated memory controller (iMC), etc. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 0624957aa068..07b5fc00b188 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -504,6 +504,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; + enum ucode_state ret; u32 rev, dummy; BUG_ON(raw_smp_processor_id() != cpu); @@ -521,9 +522,8 @@ static enum ucode_state apply_microcode_amd(int cpu) /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { - c->microcode = rev; - uci->cpu_sig.rev = rev; - return UCODE_OK; + ret = UCODE_OK; + goto out; } if (__apply_microcode_amd(mc_amd)) { @@ -531,13 +531,21 @@ static enum ucode_state apply_microcode_amd(int cpu) cpu, mc_amd->hdr.patch_id); return UCODE_ERROR; } - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, - mc_amd->hdr.patch_id); - uci->cpu_sig.rev = mc_amd->hdr.patch_id; - c->microcode = mc_amd->hdr.patch_id; + rev = mc_amd->hdr.patch_id; + ret = UCODE_UPDATED; + + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); - return UCODE_UPDATED; +out: + uci->cpu_sig.rev = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (c->cpu_index == boot_cpu_data.cpu_index) + boot_cpu_data.microcode = rev; + + return ret; } static int install_equiv_cpu_table(const u8 *buf) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 97ccf4c3b45b..16936a24795c 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -795,6 +795,7 @@ static enum ucode_state apply_microcode_intel(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_intel *mc; + enum ucode_state ret; static int prev_rev; u32 rev; @@ -817,9 +818,8 @@ static enum ucode_state apply_microcode_intel(int cpu) */ rev = intel_get_microcode_revision(); if (rev >= mc->hdr.rev) { - uci->cpu_sig.rev = rev; - c->microcode = rev; - return UCODE_OK; + ret = UCODE_OK; + goto out; } /* @@ -848,10 +848,17 @@ static enum ucode_state apply_microcode_intel(int cpu) prev_rev = rev; } + ret = UCODE_UPDATED; + +out: uci->cpu_sig.rev = rev; - c->microcode = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (c->cpu_index == boot_cpu_data.cpu_index) + boot_cpu_data.microcode = rev; - return UCODE_UPDATED; + return ret; } static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index ad12733f6058..1c72f3819eb1 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -199,6 +199,16 @@ static unsigned long hv_get_tsc_khz(void) return freq / 1000; } +#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV) +static void __init hv_smp_prepare_boot_cpu(void) +{ + native_smp_prepare_boot_cpu(); +#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS) + hv_init_spinlocks(); +#endif +} +#endif + static void __init ms_hyperv_init_platform(void) { int hv_host_info_eax; @@ -303,6 +313,10 @@ static void __init ms_hyperv_init_platform(void) if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) alloc_intr_gate(HYPERV_STIMER0_VECTOR, hv_stimer0_callback_vector); + +# ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; +# endif #endif } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 765afd599039..3668c5df90c6 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -831,7 +831,8 @@ int __init amd_special_default_mtrr(void) { u32 l, h; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return 0; if (boot_cpu_data.x86 < 0xf) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 9a19c800fe40..507039c20128 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -127,7 +127,7 @@ static void __init set_num_var_ranges(void) if (use_intel()) rdmsr(MSR_MTRRcap, config, dummy); - else if (is_cpu(AMD)) + else if (is_cpu(AMD) || is_cpu(HYGON)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) config = 8; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d389083330c5..9556930cd8c1 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -46,6 +46,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) { /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_HYGON: case X86_VENDOR_AMD: if (msr >= MSR_F15H_PERF_CTR) return (msr - MSR_F15H_PERF_CTR) >> 1; @@ -74,6 +75,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) { /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_HYGON: case X86_VENDOR_AMD: if (msr >= MSR_F15H_PERF_CTL) return (msr - MSR_F15H_PERF_CTL) >> 1; diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 8e005329648b..d9ab49bed8af 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -97,14 +97,14 @@ static void __init vmware_sched_clock_setup(void) d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, d->cyc2ns_shift); - pv_time_ops.sched_clock = vmware_sched_clock; + pv_ops.time.sched_clock = vmware_sched_clock; pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); } static void __init vmware_paravirt_ops_setup(void) { pv_info.name = "VMware hypervisor"; - pv_cpu_ops.io_delay = paravirt_nop; + pv_ops.cpu.io_delay = paravirt_nop; if (vmware_tsc_khz && vmw_sched_clock) vmware_sched_clock_setup(); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 4f2e0778feac..eb8ab3915268 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -11,40 +11,62 @@ #include <linux/uaccess.h> #include <linux/io.h> -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf, + bool encrypted) { void *vaddr; if (!csize) return 0; - vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + if (encrypted) + vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE); + else + vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!vaddr) return -ENOMEM; if (userbuf) { - if (copy_to_user(buf, vaddr + offset, csize)) { - iounmap(vaddr); + if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { + iounmap((void __iomem *)vaddr); return -EFAULT; } } else memcpy(buf, vaddr + offset, csize); set_iounmap_nonlazy(); - iounmap(vaddr); + iounmap((void __iomem *)vaddr); return csize; } + +/** + * copy_oldmem_page - copy one page of memory + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from the old kernel's memory. For this page, there is no pte + * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); +} + +/** + * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the + * memory with the encryption mask set to accomodate kdump on SME-enabled + * machines. + */ +ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); +} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f56895106ccf..2b5886401e5f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, 0); + __show_regs(regs, SHOW_REGS_SHORT); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -344,7 +344,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, true); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL); if (!signr) return; @@ -407,14 +407,9 @@ void die(const char *str, struct pt_regs *regs, long err) void show_regs(struct pt_regs *regs) { - bool all = true; - show_regs_print_info(KERN_DEFAULT); - if (IS_ENABLED(CONFIG_X86_32)) - all = !user_mode(regs); - - __show_regs(regs, all); + __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); /* * When in-kernel, we also print out the stack at the time of the fault.. diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index f260e452e4f8..e8c8c5d78dbd 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -7,11 +7,17 @@ #include <linux/eisa.h> #include <linux/io.h> +#include <xen/xen.h> + static __init int eisa_bus_probe(void) { - void __iomem *p = ioremap(0x0FFFD9, 4); + void __iomem *p; + + if (xen_pv_domain() && !xen_initial_domain()) + return 0; - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + p = ioremap(0x0FFFD9, 4); + if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) EISA_bus = 1; iounmap(p); return 0; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 23f1691670b6..61a949d84dfa 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -314,7 +314,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * thread's fpu state, reconstruct fxstate from the fsave * header. Validate and sanitize the copied state. */ - struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index ec6fefbfd3c0..76fa3b836598 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -37,6 +37,7 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); + x86_verify_bootdata_version(); x86_early_init_platform_quirks(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..5dc377dc9d7b 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -35,6 +35,7 @@ #include <asm/bootparam_utils.h> #include <asm/microcode.h> #include <asm/kasan.h> +#include <asm/fixmap.h> /* * Manage page tables very early on. @@ -112,6 +113,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -165,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[511] += load_delta; pmd = fixup_pointer(level2_fixmap_pgt, physaddr); - pmd[506] += load_delta; + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; /* * Set up the identity mapping for the switchover. These @@ -235,6 +238,21 @@ unsigned long __head __startup_64(unsigned long physaddr, sme_encrypt_kernel(bp); /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. */ @@ -439,6 +457,8 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); + x86_verify_bootdata_version(); + x86_early_init_platform_quirks(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 15ebc2fc166e..747c758f67b7 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,8 +24,9 @@ #include "../entry/calling.h" #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/fixmap.h> -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include <asm/asm-offsets.h> #include <asm/paravirt.h> #define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg @@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt) KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 NEXT_PAGE(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) .fill 512,8,0 + .endr #undef PMDS diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index eeea935e9bb5..aac0c1f7e354 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -42,55 +42,40 @@ static void __ref __jump_label_transform(struct jump_entry *entry, void *(*poker)(void *, const void *, size_t), int init) { - union jump_code_union code; + union jump_code_union jmp; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + const void *expect, *code; + int line; + + jmp.jump = 0xe9; + jmp.offset = jump_entry_target(entry) - + (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); if (early_boot_irqs_disabled) poker = text_poke_early; if (type == JUMP_LABEL_JMP) { if (init) { - /* - * Jump label is enabled for the first time. - * So we expect a default_nop... - */ - if (unlikely(memcmp((void *)entry->code, default_nop, 5) - != 0)) - bug_at((void *)entry->code, __LINE__); + expect = default_nop; line = __LINE__; } else { - /* - * ...otherwise expect an ideal_nop. Otherwise - * something went horribly wrong. - */ - if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) - != 0)) - bug_at((void *)entry->code, __LINE__); + expect = ideal_nop; line = __LINE__; } - code.jump = 0xe9; - code.offset = entry->target - - (entry->code + JUMP_LABEL_NOP_SIZE); + code = &jmp.code; } else { - /* - * We are disabling this jump label. If it is not what - * we think it is, then something must have gone wrong. - * If this is the first initialization call, then we - * are converting the default nop to the ideal nop. - */ if (init) { - if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0)) - bug_at((void *)entry->code, __LINE__); + expect = default_nop; line = __LINE__; } else { - code.jump = 0xe9; - code.offset = entry->target - - (entry->code + JUMP_LABEL_NOP_SIZE); - if (unlikely(memcmp((void *)entry->code, &code, 5) != 0)) - bug_at((void *)entry->code, __LINE__); + expect = &jmp.code; line = __LINE__; } - memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + + code = ideal_nop; } + if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) + bug_at((void *)jump_entry_code(entry), line); + /* * Make text_poke_bp() a default fallback poker. * @@ -99,11 +84,14 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction * */ - if (poker) - (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); - else - text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE, - (void *)entry->code + JUMP_LABEL_NOP_SIZE); + if (poker) { + (*poker)((void *)jump_entry_code(entry), code, + JUMP_LABEL_NOP_SIZE); + return; + } + + text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE, + (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); } void arch_jump_label_transform(struct jump_entry *entry, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b0d1e81c96bb..c33b06f5faa4 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1020,64 +1020,18 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) */ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs, trapnr)) - return 1; - - /* - * fixup routine could not handle it, - * Let do_page_fault() fix it. - */ } return 0; } NOKPROBE_SYMBOL(kprobe_fault_handler); -/* - * Wrapper routine for handling exceptions. - */ -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, - void *data) -{ - struct die_args *args = data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode(args->regs)) - return ret; - - if (val == DIE_GPF) { - /* - * To be potentially processing a kprobe fault and to - * trust the result from kprobe_running(), we have - * be non-preemptible. - */ - if (!preemptible() && kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - } - return ret; -} -NOKPROBE_SYMBOL(kprobe_exceptions_notify); - bool arch_within_kprobe_blacklist(unsigned long addr) { - bool is_in_entry_trampoline_section = false; - -#ifdef CONFIG_X86_64 - is_in_entry_trampoline_section = - (addr >= (unsigned long)__entry_trampoline_start && - addr < (unsigned long)__entry_trampoline_end); -#endif return (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) || (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end) || - is_in_entry_trampoline_section; + addr < (unsigned long)__entry_text_end); } int __init arch_init_kprobes(void) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index eaf02f2e7300..40b16b270656 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -179,7 +179,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + preempt_enable(); } NOKPROBE_SYMBOL(optimized_callback); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d9b71924c23c..ba4bfb7f6a36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -283,7 +283,7 @@ static void __init paravirt_ops_setup(void) pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) - pv_cpu_ops.io_delay = kvm_io_delay; + pv_ops.cpu.io_delay = kvm_io_delay; #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; @@ -632,14 +632,14 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_time_ops.steal_clock = kvm_steal_clock; + pv_ops.time.steal_clock = kvm_steal_clock; } if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; - pv_mmu_ops.tlb_remove_table = tlb_remove_table; + pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -850,13 +850,14 @@ void __init kvm_spinlock_init(void) return; __pv_init_lock_hash(); - pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; - pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); - pv_lock_ops.wait = kvm_wait; - pv_lock_ops.kick = kvm_kick_cpu; + pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.lock.queued_spin_unlock = + PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops.lock.wait = kvm_wait; + pv_ops.lock.kick = kvm_kick_cpu; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_lock_ops.vcpu_is_preempted = + pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1e6764648af3..30084ecaa20f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include <linux/sched/clock.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/set_memory.h> #include <asm/hypervisor.h> #include <asm/mem_encrypt.h> @@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); -static struct pvclock_wall_clock wall_clock; + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +static struct pvclock_vsyscall_time_info *hvclock_mem; static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { @@ -116,13 +118,13 @@ static u64 kvm_sched_clock_read(void) static inline void kvm_sched_clock_init(bool stable) { if (!stable) { - pv_time_ops.sched_clock = kvm_clock_read; + pv_ops.time.sched_clock = kvm_clock_read; clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); - pv_time_ops.sched_clock = kvm_sched_clock_read; + pv_ops.time.sched_clock = kvm_sched_clock_read; pr_info("kvm-clock: using sched offset of %llu cycles", kvm_sched_clock_offset); @@ -236,6 +238,45 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (sev_active()) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + static int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 @@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif + + kvmclock_init_mem(); + return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); @@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* Use the static page for the first CPUs, allocate otherwise */ if (cpu < HVC_BOOT_ARRAY_SIZE) p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; else - p = kzalloc(sizeof(*p), GFP_KERNEL); + return -ENOMEM; per_cpu(hv_clock_per_cpu, cpu) = p; return p ? 0 : -ENOMEM; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 733e6ace0fa4..ab18e0884dc6 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); - flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false); ldt->slot = slot; return 0; diff --git a/arch/x86/kernel/macros.S b/arch/x86/kernel/macros.S new file mode 100644 index 000000000000..161c95059044 --- /dev/null +++ b/arch/x86/kernel/macros.S @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * This file includes headers whose assembly part includes macros which are + * commonly used. The macros are precompiled into assmebly file which is later + * assembled together with each compiled file. + */ + +#include <linux/compiler.h> +#include <asm/refcount.h> +#include <asm/alternative-asm.h> +#include <asm/bug.h> +#include <asm/paravirt.h> +#include <asm/asm.h> +#include <asm/cpufeature.h> +#include <asm/jump_label.h> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f58336af095c..b052e883dd8c 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -201,6 +201,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, goto overflow; #endif break; + case R_X86_64_PC64: + if (*(u64 *)loc != 0) + goto invalid_relocation; + val -= (u64)loc; + *(u64 *)loc = val; + break; default: pr_err("%s: Unknown rela relocation: %llu\n", me->name, ELF64_R_TYPE(rel[i].r_info)); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 71f2d1125ec0..4f75d0cf6305 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -17,7 +17,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); bool pv_is_native_spin_unlock(void) { - return pv_lock_ops.queued_spin_unlock.func == + return pv_ops.lock.queued_spin_unlock.func == __raw_callee_save___native_queued_spin_unlock; } @@ -29,17 +29,6 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); bool pv_is_native_vcpu_is_preempted(void) { - return pv_lock_ops.vcpu_is_preempted.func == + return pv_ops.lock.vcpu_is_preempted.func == __raw_callee_save___native_vcpu_is_preempted; } - -struct pv_lock_ops pv_lock_ops = { -#ifdef CONFIG_SMP - .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, - .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), - .wait = paravirt_nop, - .kick = paravirt_nop, - .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), -#endif /* SMP */ -}; -EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index afdb303285f8..e4d4df37922a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -81,17 +81,15 @@ struct branch { u32 delta; } __attribute__((packed)); -unsigned paravirt_patch_call(void *insnbuf, - const void *target, u16 tgt_clobbers, - unsigned long addr, u16 site_clobbers, - unsigned len) +static unsigned paravirt_patch_call(void *insnbuf, const void *target, + unsigned long addr, unsigned len) { struct branch *b = insnbuf; unsigned long delta = (unsigned long)target - (addr+5); if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -103,15 +101,16 @@ unsigned paravirt_patch_call(void *insnbuf, return 5; } -unsigned paravirt_patch_jmp(void *insnbuf, const void *target, - unsigned long addr, unsigned len) +#ifdef CONFIG_PARAVIRT_XXL +static unsigned paravirt_patch_jmp(void *insnbuf, const void *target, + unsigned long addr, unsigned len) { struct branch *b = insnbuf; unsigned long delta = (unsigned long)target - (addr+5); if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -121,6 +120,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, return 5; } +#endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -130,29 +130,14 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -/* - * Neat trick to map patch type back to the call within the - * corresponding structure. - */ -static void *get_call_destination(u8 type) -{ - struct paravirt_patch_template tmpl = { - .pv_init_ops = pv_init_ops, - .pv_time_ops = pv_time_ops, - .pv_cpu_ops = pv_cpu_ops, - .pv_irq_ops = pv_irq_ops, - .pv_mmu_ops = pv_mmu_ops, -#ifdef CONFIG_PARAVIRT_SPINLOCKS - .pv_lock_ops = pv_lock_ops, -#endif - }; - return *((void **)&tmpl + type); -} - -unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, +unsigned paravirt_patch_default(u8 type, void *insnbuf, unsigned long addr, unsigned len) { - void *opfunc = get_call_destination(type); + /* + * Neat trick to map patch type back to the call within the + * corresponding structure. + */ + void *opfunc = *((void **)&pv_ops + type); unsigned ret; if (opfunc == NULL) @@ -167,15 +152,15 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, else if (opfunc == _paravirt_ident_64) ret = paravirt_patch_ident_64(insnbuf, len); - else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || - type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) +#ifdef CONFIG_PARAVIRT_XXL + else if (type == PARAVIRT_PATCH(cpu.iret) || + type == PARAVIRT_PATCH(cpu.usergs_sysret64)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); +#endif else - /* Otherwise call the function; assume target could - clobber any caller-save reg */ - ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY, - addr, clobbers, len); + /* Otherwise call the function. */ + ret = paravirt_patch_call(insnbuf, opfunc, addr, len); return ret; } @@ -281,6 +266,7 @@ void paravirt_flush_lazy_mmu(void) preempt_enable(); } +#ifdef CONFIG_PARAVIRT_XXL void paravirt_start_context_switch(struct task_struct *prev) { BUG_ON(preemptible()); @@ -301,6 +287,7 @@ void paravirt_end_context_switch(struct task_struct *next) if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } +#endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { @@ -312,85 +299,16 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", +#ifdef CONFIG_PARAVIRT_XXL .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ #ifdef CONFIG_X86_64 .extra_user_64bit_cs = __USER_CS, #endif -}; - -struct pv_init_ops pv_init_ops = { - .patch = native_patch, -}; - -struct pv_time_ops pv_time_ops = { - .sched_clock = native_sched_clock, - .steal_clock = native_steal_clock, -}; - -__visible struct pv_irq_ops pv_irq_ops = { - .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), - .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), - .safe_halt = native_safe_halt, - .halt = native_halt, -}; - -__visible struct pv_cpu_ops pv_cpu_ops = { - .cpuid = native_cpuid, - .get_debugreg = native_get_debugreg, - .set_debugreg = native_set_debugreg, - .read_cr0 = native_read_cr0, - .write_cr0 = native_write_cr0, - .write_cr4 = native_write_cr4, -#ifdef CONFIG_X86_64 - .read_cr8 = native_read_cr8, - .write_cr8 = native_write_cr8, #endif - .wbinvd = native_wbinvd, - .read_msr = native_read_msr, - .write_msr = native_write_msr, - .read_msr_safe = native_read_msr_safe, - .write_msr_safe = native_write_msr_safe, - .read_pmc = native_read_pmc, - .load_tr_desc = native_load_tr_desc, - .set_ldt = native_set_ldt, - .load_gdt = native_load_gdt, - .load_idt = native_load_idt, - .store_tr = native_store_tr, - .load_tls = native_load_tls, -#ifdef CONFIG_X86_64 - .load_gs_index = native_load_gs_index, -#endif - .write_ldt_entry = native_write_ldt_entry, - .write_gdt_entry = native_write_gdt_entry, - .write_idt_entry = native_write_idt_entry, - - .alloc_ldt = paravirt_nop, - .free_ldt = paravirt_nop, - - .load_sp0 = native_load_sp0, - -#ifdef CONFIG_X86_64 - .usergs_sysret64 = native_usergs_sysret64, -#endif - .iret = native_iret, - .swapgs = native_swapgs, - - .set_iopl_mask = native_set_iopl_mask, - .io_delay = native_io_delay, - - .start_context_switch = paravirt_nop, - .end_context_switch = paravirt_nop, }; -/* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_get_debugreg); -NOKPROBE_SYMBOL(native_set_debugreg); -NOKPROBE_SYMBOL(native_load_idt); - #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) /* 32-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) @@ -399,85 +317,171 @@ NOKPROBE_SYMBOL(native_load_idt); #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif -struct pv_mmu_ops pv_mmu_ops __ro_after_init = { +struct paravirt_patch_template pv_ops = { + /* Init ops. */ + .init.patch = native_patch, - .read_cr2 = native_read_cr2, - .write_cr2 = native_write_cr2, - .read_cr3 = __native_read_cr3, - .write_cr3 = native_write_cr3, + /* Time ops. */ + .time.sched_clock = native_sched_clock, + .time.steal_clock = native_steal_clock, - .flush_tlb_user = native_flush_tlb, - .flush_tlb_kernel = native_flush_tlb_global, - .flush_tlb_one_user = native_flush_tlb_one_user, - .flush_tlb_others = native_flush_tlb_others, - .tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page, + /* Cpu ops. */ + .cpu.io_delay = native_io_delay, - .pgd_alloc = __paravirt_pgd_alloc, - .pgd_free = paravirt_nop, +#ifdef CONFIG_PARAVIRT_XXL + .cpu.cpuid = native_cpuid, + .cpu.get_debugreg = native_get_debugreg, + .cpu.set_debugreg = native_set_debugreg, + .cpu.read_cr0 = native_read_cr0, + .cpu.write_cr0 = native_write_cr0, + .cpu.write_cr4 = native_write_cr4, +#ifdef CONFIG_X86_64 + .cpu.read_cr8 = native_read_cr8, + .cpu.write_cr8 = native_write_cr8, +#endif + .cpu.wbinvd = native_wbinvd, + .cpu.read_msr = native_read_msr, + .cpu.write_msr = native_write_msr, + .cpu.read_msr_safe = native_read_msr_safe, + .cpu.write_msr_safe = native_write_msr_safe, + .cpu.read_pmc = native_read_pmc, + .cpu.load_tr_desc = native_load_tr_desc, + .cpu.set_ldt = native_set_ldt, + .cpu.load_gdt = native_load_gdt, + .cpu.load_idt = native_load_idt, + .cpu.store_tr = native_store_tr, + .cpu.load_tls = native_load_tls, +#ifdef CONFIG_X86_64 + .cpu.load_gs_index = native_load_gs_index, +#endif + .cpu.write_ldt_entry = native_write_ldt_entry, + .cpu.write_gdt_entry = native_write_gdt_entry, + .cpu.write_idt_entry = native_write_idt_entry, - .alloc_pte = paravirt_nop, - .alloc_pmd = paravirt_nop, - .alloc_pud = paravirt_nop, - .alloc_p4d = paravirt_nop, - .release_pte = paravirt_nop, - .release_pmd = paravirt_nop, - .release_pud = paravirt_nop, - .release_p4d = paravirt_nop, + .cpu.alloc_ldt = paravirt_nop, + .cpu.free_ldt = paravirt_nop, - .set_pte = native_set_pte, - .set_pte_at = native_set_pte_at, - .set_pmd = native_set_pmd, + .cpu.load_sp0 = native_load_sp0, - .ptep_modify_prot_start = __ptep_modify_prot_start, - .ptep_modify_prot_commit = __ptep_modify_prot_commit, +#ifdef CONFIG_X86_64 + .cpu.usergs_sysret64 = native_usergs_sysret64, +#endif + .cpu.iret = native_iret, + .cpu.swapgs = native_swapgs, + + .cpu.set_iopl_mask = native_set_iopl_mask, + + .cpu.start_context_switch = paravirt_nop, + .cpu.end_context_switch = paravirt_nop, + + /* Irq ops. */ + .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), + .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.safe_halt = native_safe_halt, + .irq.halt = native_halt, +#endif /* CONFIG_PARAVIRT_XXL */ + + /* Mmu ops. */ + .mmu.flush_tlb_user = native_flush_tlb, + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_others = native_flush_tlb_others, + .mmu.tlb_remove_table = + (void (*)(struct mmu_gather *, void *))tlb_remove_page, + + .mmu.exit_mmap = paravirt_nop, + +#ifdef CONFIG_PARAVIRT_XXL + .mmu.read_cr2 = native_read_cr2, + .mmu.write_cr2 = native_write_cr2, + .mmu.read_cr3 = __native_read_cr3, + .mmu.write_cr3 = native_write_cr3, + + .mmu.pgd_alloc = __paravirt_pgd_alloc, + .mmu.pgd_free = paravirt_nop, + + .mmu.alloc_pte = paravirt_nop, + .mmu.alloc_pmd = paravirt_nop, + .mmu.alloc_pud = paravirt_nop, + .mmu.alloc_p4d = paravirt_nop, + .mmu.release_pte = paravirt_nop, + .mmu.release_pmd = paravirt_nop, + .mmu.release_pud = paravirt_nop, + .mmu.release_p4d = paravirt_nop, + + .mmu.set_pte = native_set_pte, + .mmu.set_pte_at = native_set_pte_at, + .mmu.set_pmd = native_set_pmd, + + .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, + .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, #if CONFIG_PGTABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE - .set_pte_atomic = native_set_pte_atomic, - .pte_clear = native_pte_clear, - .pmd_clear = native_pmd_clear, + .mmu.set_pte_atomic = native_set_pte_atomic, + .mmu.pte_clear = native_pte_clear, + .mmu.pmd_clear = native_pmd_clear, #endif - .set_pud = native_set_pud, + .mmu.set_pud = native_set_pud, - .pmd_val = PTE_IDENT, - .make_pmd = PTE_IDENT, + .mmu.pmd_val = PTE_IDENT, + .mmu.make_pmd = PTE_IDENT, #if CONFIG_PGTABLE_LEVELS >= 4 - .pud_val = PTE_IDENT, - .make_pud = PTE_IDENT, + .mmu.pud_val = PTE_IDENT, + .mmu.make_pud = PTE_IDENT, - .set_p4d = native_set_p4d, + .mmu.set_p4d = native_set_p4d, #if CONFIG_PGTABLE_LEVELS >= 5 - .p4d_val = PTE_IDENT, - .make_p4d = PTE_IDENT, + .mmu.p4d_val = PTE_IDENT, + .mmu.make_p4d = PTE_IDENT, - .set_pgd = native_set_pgd, + .mmu.set_pgd = native_set_pgd, #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ #endif /* CONFIG_PGTABLE_LEVELS >= 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ - .pte_val = PTE_IDENT, - .pgd_val = PTE_IDENT, + .mmu.pte_val = PTE_IDENT, + .mmu.pgd_val = PTE_IDENT, - .make_pte = PTE_IDENT, - .make_pgd = PTE_IDENT, + .mmu.make_pte = PTE_IDENT, + .mmu.make_pgd = PTE_IDENT, - .dup_mmap = paravirt_nop, - .exit_mmap = paravirt_nop, - .activate_mm = paravirt_nop, + .mmu.dup_mmap = paravirt_nop, + .mmu.activate_mm = paravirt_nop, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - .flush = paravirt_nop, + .mmu.lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + .flush = paravirt_nop, }, - .set_fixmap = native_set_fixmap, + .mmu.set_fixmap = native_set_fixmap, +#endif /* CONFIG_PARAVIRT_XXL */ + +#if defined(CONFIG_PARAVIRT_SPINLOCKS) + /* Lock ops. */ +#ifdef CONFIG_SMP + .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .lock.queued_spin_unlock = + PV_CALLEE_SAVE(__native_queued_spin_unlock), + .lock.wait = paravirt_nop, + .lock.kick = paravirt_nop, + .lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__native_vcpu_is_preempted), +#endif /* SMP */ +#endif }; -EXPORT_SYMBOL_GPL(pv_time_ops); -EXPORT_SYMBOL (pv_cpu_ops); -EXPORT_SYMBOL (pv_mmu_ops); +#ifdef CONFIG_PARAVIRT_XXL +/* At this point, native_get/set_debugreg has real function entries */ +NOKPROBE_SYMBOL(native_get_debugreg); +NOKPROBE_SYMBOL(native_set_debugreg); +NOKPROBE_SYMBOL(native_load_idt); +#endif + +EXPORT_SYMBOL_GPL(pv_ops); EXPORT_SYMBOL_GPL(pv_info); -EXPORT_SYMBOL (pv_irq_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 758e69d72ebf..6368c22fa1fa 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -1,18 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/paravirt.h> -DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); -DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); -DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); -DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); -DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +#ifdef CONFIG_PARAVIRT_XXL +DEF_NATIVE(irq, irq_disable, "cli"); +DEF_NATIVE(irq, irq_enable, "sti"); +DEF_NATIVE(irq, restore_fl, "push %eax; popf"); +DEF_NATIVE(irq, save_fl, "pushf; pop %eax"); +DEF_NATIVE(cpu, iret, "iret"); +DEF_NATIVE(mmu, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(mmu, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(mmu, read_cr3, "mov %cr3, %eax"); +#endif #if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); -DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); +DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)"); +DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -30,53 +32,42 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) extern bool pv_is_native_spin_unlock(void); extern bool pv_is_native_vcpu_is_preempted(void); -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len) +unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len) { - const unsigned char *start, *end; - unsigned ret; - #define PATCH_SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - start = start_##ops##_##x; \ - end = end_##ops##_##x; \ - goto patch_site + case PARAVIRT_PATCH(ops.x): \ + return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x) + switch (type) { - PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_irq_ops, irq_enable); - PATCH_SITE(pv_irq_ops, restore_fl); - PATCH_SITE(pv_irq_ops, save_fl); - PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_mmu_ops, read_cr2); - PATCH_SITE(pv_mmu_ops, read_cr3); - PATCH_SITE(pv_mmu_ops, write_cr3); +#ifdef CONFIG_PARAVIRT_XXL + PATCH_SITE(irq, irq_disable); + PATCH_SITE(irq, irq_enable); + PATCH_SITE(irq, restore_fl); + PATCH_SITE(irq, save_fl); + PATCH_SITE(cpu, iret); + PATCH_SITE(mmu, read_cr2); + PATCH_SITE(mmu, read_cr3); + PATCH_SITE(mmu, write_cr3); +#endif #if defined(CONFIG_PARAVIRT_SPINLOCKS) - case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): - if (pv_is_native_spin_unlock()) { - start = start_pv_lock_ops_queued_spin_unlock; - end = end_pv_lock_ops_queued_spin_unlock; - goto patch_site; - } - goto patch_default; + case PARAVIRT_PATCH(lock.queued_spin_unlock): + if (pv_is_native_spin_unlock()) + return paravirt_patch_insns(ibuf, len, + start_lock_queued_spin_unlock, + end_lock_queued_spin_unlock); + break; - case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) { - start = start_pv_lock_ops_vcpu_is_preempted; - end = end_pv_lock_ops_vcpu_is_preempted; - goto patch_site; - } - goto patch_default; + case PARAVIRT_PATCH(lock.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) + return paravirt_patch_insns(ibuf, len, + start_lock_vcpu_is_preempted, + end_lock_vcpu_is_preempted); + break; #endif default: -patch_default: __maybe_unused - ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); - break; - -patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); break; } #undef PATCH_SITE - return ret; + return paravirt_patch_default(type, ibuf, addr, len); } diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 9cb98f7b07c9..7ca9cb726f4d 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -3,24 +3,26 @@ #include <asm/asm-offsets.h> #include <linux/stringify.h> -DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); -DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); -DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); -DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); -DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); -DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); -DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); +#ifdef CONFIG_PARAVIRT_XXL +DEF_NATIVE(irq, irq_disable, "cli"); +DEF_NATIVE(irq, irq_enable, "sti"); +DEF_NATIVE(irq, restore_fl, "pushq %rdi; popfq"); +DEF_NATIVE(irq, save_fl, "pushfq; popq %rax"); +DEF_NATIVE(mmu, read_cr2, "movq %cr2, %rax"); +DEF_NATIVE(mmu, read_cr3, "movq %cr3, %rax"); +DEF_NATIVE(mmu, write_cr3, "movq %rdi, %cr3"); +DEF_NATIVE(cpu, wbinvd, "wbinvd"); -DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); -DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); +DEF_NATIVE(cpu, usergs_sysret64, "swapgs; sysretq"); +DEF_NATIVE(cpu, swapgs, "swapgs"); +#endif DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); -DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); +DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%rdi)"); +DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -38,55 +40,44 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) extern bool pv_is_native_spin_unlock(void); extern bool pv_is_native_vcpu_is_preempted(void); -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len) +unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len) { - const unsigned char *start, *end; - unsigned ret; - #define PATCH_SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - start = start_##ops##_##x; \ - end = end_##ops##_##x; \ - goto patch_site - switch(type) { - PATCH_SITE(pv_irq_ops, restore_fl); - PATCH_SITE(pv_irq_ops, save_fl); - PATCH_SITE(pv_irq_ops, irq_enable); - PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, usergs_sysret64); - PATCH_SITE(pv_cpu_ops, swapgs); - PATCH_SITE(pv_mmu_ops, read_cr2); - PATCH_SITE(pv_mmu_ops, read_cr3); - PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, wbinvd); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) - case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): - if (pv_is_native_spin_unlock()) { - start = start_pv_lock_ops_queued_spin_unlock; - end = end_pv_lock_ops_queued_spin_unlock; - goto patch_site; - } - goto patch_default; + case PARAVIRT_PATCH(ops.x): \ + return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x) - case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) { - start = start_pv_lock_ops_vcpu_is_preempted; - end = end_pv_lock_ops_vcpu_is_preempted; - goto patch_site; - } - goto patch_default; + switch (type) { +#ifdef CONFIG_PARAVIRT_XXL + PATCH_SITE(irq, restore_fl); + PATCH_SITE(irq, save_fl); + PATCH_SITE(irq, irq_enable); + PATCH_SITE(irq, irq_disable); + PATCH_SITE(cpu, usergs_sysret64); + PATCH_SITE(cpu, swapgs); + PATCH_SITE(cpu, wbinvd); + PATCH_SITE(mmu, read_cr2); + PATCH_SITE(mmu, read_cr3); + PATCH_SITE(mmu, write_cr3); #endif +#if defined(CONFIG_PARAVIRT_SPINLOCKS) + case PARAVIRT_PATCH(lock.queued_spin_unlock): + if (pv_is_native_spin_unlock()) + return paravirt_patch_insns(ibuf, len, + start_lock_queued_spin_unlock, + end_lock_queued_spin_unlock); + break; - default: -patch_default: __maybe_unused - ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + case PARAVIRT_PATCH(lock.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) + return paravirt_patch_insns(ibuf, len, + start_lock_vcpu_is_preempted, + end_lock_vcpu_is_preempted); break; +#endif -patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); + default: break; } #undef PATCH_SITE - return ret; + return paravirt_patch_default(type, ibuf, addr, len); } diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 661583662430..71c0b01d93b1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,10 +42,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, int __init pci_swiotlb_detect_4gb(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ -#ifdef CONFIG_X86_64 if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; -#endif /* * If SME is active then swiotlb will be set to 1 so that bounce diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2924fd447e61..5046a3c9dec2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -59,7 +59,7 @@ #include <asm/intel_rdt_sched.h> #include <asm/proto.h> -void __show_regs(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -85,7 +85,7 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags); - if (!all) + if (mode != SHOW_REGS_ALL) return; cr0 = read_cr0(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a451bc374b9b..31b4755369f0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -54,15 +54,14 @@ #include <asm/vdso.h> #include <asm/intel_rdt_sched.h> #include <asm/unistd.h> +#include <asm/fsgsbase.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> #endif -__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); - /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@ -87,9 +86,17 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); - if (!all) + if (mode == SHOW_REGS_SHORT) return; + if (mode == SHOW_REGS_USER) { + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", + fs, shadowgs); + return; + } + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@ -278,6 +285,138 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) +{ + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); +} + +static unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. + */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + +void x86_fsbase_write_cpu(unsigned long fsbase) +{ + /* + * Set the selector to 0 as a notion, that the segment base is + * overwritten, which will be checked for skipping the segment load + * during context switch. + */ + loadseg(FS, 0); + wrmsrl(MSR_FS_BASE, fsbase); +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + /* Set the selector to 0 for the same reason as %fs above. */ + loadseg(GS, 0); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); +} + +unsigned long x86_fsbase_read_task(struct task_struct *task) +{ + unsigned long fsbase; + + if (task == current) + fsbase = x86_fsbase_read_cpu(); + else if (task->thread.fsindex == 0) + fsbase = task->thread.fsbase; + else + fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); + + return fsbase; +} + +unsigned long x86_gsbase_read_task(struct task_struct *task) +{ + unsigned long gsbase; + + if (task == current) + gsbase = x86_gsbase_read_cpu_inactive(); + else if (task->thread.gsindex == 0) + gsbase = task->thread.gsbase; + else + gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); + + return gsbase; +} + +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) +{ + /* + * Not strictly needed for %fs, but do it for symmetry + * with %gs + */ + if (unlikely(fsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.fsbase = fsbase; + if (task == current) + x86_fsbase_write_cpu(fsbase); + task->thread.fsindex = 0; + preempt_enable(); + + return 0; +} + +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) +{ + if (unlikely(gsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.gsbase = gsbase; + if (task == current) + x86_gsbase_write_cpu_inactive(gsbase); + task->thread.gsindex = 0; + preempt_enable(); + + return 0; +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -465,10 +604,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + x86_fsgsbase_load(prev, next); switch_fpu_finish(next_fpu, cpu); @@ -619,54 +755,25 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; - int doit = task == current; - int cpu; switch (option) { - case ARCH_SET_GS: - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.gsindex = 0; - task->thread.gsbase = arg2; - if (doit) { - load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); - } - put_cpu(); + case ARCH_SET_GS: { + ret = x86_gsbase_write_task(task, arg2); break; - case ARCH_SET_FS: - /* Not strictly needed for fs, but do it for symmetry - with gs */ - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.fsindex = 0; - task->thread.fsbase = arg2; - if (doit) { - /* set the selector to 0 to not confuse __switch_to */ - loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, arg2); - } - put_cpu(); + } + case ARCH_SET_FS: { + ret = x86_fsbase_write_task(task, arg2); break; + } case ARCH_GET_FS: { - unsigned long base; + unsigned long base = x86_fsbase_read_task(task); - if (doit) - rdmsrl(MSR_FS_BASE, base); - else - base = task->thread.fsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { - unsigned long base; + unsigned long base = x86_gsbase_read_task(task); - if (doit) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e2ee403865eb..ffae9b9740fd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -39,6 +39,7 @@ #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/syscall.h> +#include <asm/fsgsbase.h> #include "tls.h" @@ -396,12 +397,11 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl_64 - * to set either thread.fs or thread.fsindex and the - * corresponding GDT slot. + * When changing the FS base, use the same + * mechanism as for do_arch_prctl_64(). */ if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + return x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -410,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + return x86_gsbase_write_task(child, value); return 0; #endif } @@ -434,20 +434,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) return get_flags(task); #ifdef CONFIG_X86_64 - case offsetof(struct user_regs_struct, fs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.fsbase; - } - case offsetof(struct user_regs_struct, gs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.gsbase; - } + case offsetof(struct user_regs_struct, fs_base): + return x86_fsbase_read_task(task); + case offsetof(struct user_regs_struct, gs_base): + return x86_gsbase_read_task(task); #endif } @@ -1369,33 +1359,18 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -static void fill_sigtrap_info(struct task_struct *tsk, - struct pt_regs *regs, - int error_code, int si_code, - struct siginfo *info) +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) { tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; - info->si_signo = SIGTRAP; - info->si_code = si_code; - info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; -} - -void user_single_step_siginfo(struct task_struct *tsk, - struct pt_regs *regs, - struct siginfo *info) -{ - fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info); + /* Send us the fake SIGTRAP */ + force_sig_fault(SIGTRAP, si_code, + user_mode(regs) ? (void __user *)regs->ip : NULL, tsk); } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +void user_single_step_report(struct pt_regs *regs) { - struct siginfo info; - - clear_siginfo(&info); - fill_sigtrap_info(tsk, regs, error_code, si_code, &info); - /* Send us the fake SIGTRAP */ - force_sig_info(SIGTRAP, &info, tsk); + send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4866badb235..7005f89bf3b2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1251,7 +1251,7 @@ void __init setup_arch(char **cmdline_p) x86_init.hyper.guest_late_init(); e820__reserve_resources(); - e820__register_nosave_regions(max_low_pfn); + e820__register_nosave_regions(max_pfn); x86_init.resources.reserve_resources(); @@ -1281,6 +1281,23 @@ void __init setup_arch(char **cmdline_p) unwind_init(); } +/* + * From boot protocol 2.14 onwards we expect the bootloader to set the + * version to "0x8000 | <used version>". In case we find a version >= 2.14 + * without the 0x8000 we assume the boot loader supports 2.13 only and + * reset the version accordingly. The 0x8000 flag is removed in any case. + */ +void __init x86_verify_bootdata_version(void) +{ + if (boot_params.hdr.version & VERSION_WRITTEN) + boot_params.hdr.version &= ~VERSION_WRITTEN; + else if (boot_params.hdr.version >= 0x020e) + boot_params.hdr.version = 0x020d; + + if (boot_params.hdr.version < 0x020e) + boot_params.hdr.acpi_rsdp_addr = 0; +} + #ifdef CONFIG_X86_32 static struct resource video_ram_resource = { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f02ecaf97904..5369d7fac797 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -676,6 +676,7 @@ static void __init smp_quirk_init_udelay(void) /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) || ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; return; @@ -1592,7 +1593,8 @@ static inline void mwait_play_dead(void) void *mwait_ptr; int i; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) return; if (!this_cpu_has(X86_FEATURE_MWAIT)) return; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be01328eb755..0e14f6c0d35e 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -10,6 +10,7 @@ * */ +#include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/irq.h> @@ -25,7 +26,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES; +__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) @@ -105,3 +106,24 @@ void __init time_init(void) { late_time_init = x86_late_time_init; } + +/* + * Sanity check the vdso related archdata content. + */ +void clocksource_arch_init(struct clocksource *cs) +{ + if (cs->archdata.vclock_mode == VCLOCK_NONE) + return; + + if (cs->archdata.vclock_mode > VCLOCK_MAX) { + pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n", + cs->name, cs->archdata.vclock_mode); + cs->archdata.vclock_mode = VCLOCK_NONE; + } + + if (cs->mask != CLOCKSOURCE_MASK(64)) { + pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n", + cs->name, cs->mask); + cs->archdata.vclock_mode = VCLOCK_NONE; + } +} diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 12cbe2b88c0f..738bf42b0218 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -111,8 +111,10 @@ int arch_register_cpu(int num) /* * Currently CPU0 is only hotpluggable on Intel platforms. Other * vendors can add hotplug support later. + * Xen PV guests don't support CPU0 hotplug at all. */ - if (c->x86_vendor != X86_VENDOR_INTEL) + if (c->x86_vendor != X86_VENDOR_INTEL || + boot_cpu_has(X86_FEATURE_XENPV)) cpu0_hotpluggable = 0; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e6db475164ed..8f6dcd88202e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -189,7 +189,7 @@ int fixup_bug(struct pt_regs *regs, int trapnr) } static nokprobe_inline int -do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, +do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) { if (v8086_mode(regs)) { @@ -202,11 +202,8 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, error_code, trapnr)) return 0; } - return -1; - } - - if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr)) + } else if (!user_mode(regs)) { + if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; @@ -214,49 +211,6 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, die(str, regs, error_code); } - return -1; -} - -static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, - siginfo_t *info) -{ - unsigned long siaddr; - int sicode; - - switch (trapnr) { - default: - return SEND_SIG_PRIV; - - case X86_TRAP_DE: - sicode = FPE_INTDIV; - siaddr = uprobe_get_trap_addr(regs); - break; - case X86_TRAP_UD: - sicode = ILL_ILLOPN; - siaddr = uprobe_get_trap_addr(regs); - break; - case X86_TRAP_AC: - sicode = BUS_ADRALN; - siaddr = 0; - break; - } - - info->si_signo = signr; - info->si_errno = 0; - info->si_code = sicode; - info->si_addr = (void __user *)siaddr; - return info; -} - -static void -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - - - if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) - return; /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not @@ -269,24 +223,45 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; + return -1; +} + +static void show_signal(struct task_struct *tsk, int signr, + const char *type, const char *desc, + struct pt_regs *regs, long error_code) +{ if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { - pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, + pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), type, desc, regs->ip, regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } +} - force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); +static void +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, int sicode, void __user *addr) +{ + struct task_struct *tsk = current; + + + if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) + return; + + show_signal(tsk, signr, "trap ", str, regs, error_code); + + if (!sicode) + force_sig(signr, tsk); + else + force_sig_fault(signr, sicode, addr, tsk); } NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, - unsigned long trapnr, int signr) + unsigned long trapnr, int signr, int sicode, void __user *addr) { - siginfo_t info; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); /* @@ -299,26 +274,26 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); - clear_siginfo(&info); - do_trap(trapnr, signr, str, regs, error_code, - fill_trap_info(regs, signr, trapnr, &info)); + do_trap(trapnr, signr, str, regs, error_code, sicode, addr); } } -#define DO_ERROR(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - do_error_trap(regs, error_code, str, trapnr, signr); \ +#define IP ((void __user *)uprobe_get_trap_addr(regs)) +#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) -DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) -DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) -DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error) +DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow) +DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) +DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) +DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check) +#undef IP #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(const char *message, @@ -383,6 +358,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * we won't enable interupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. + * + * We will enter general_protection with kernel GSBASE, + * which is what the stub expects, given that the faulting + * RIP will be the IRET instruction. */ regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; @@ -455,7 +434,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { const struct mpx_bndcsr *bndcsr; - siginfo_t *info; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, "bounds", regs, error_code, @@ -493,8 +471,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) goto exit_trap; break; /* Success, it was handled */ case 1: /* Bound violation. */ - info = mpx_generate_siginfo(regs); - if (IS_ERR(info)) { + { + struct task_struct *tsk = current; + struct mpx_fault_info mpx; + + if (mpx_fault_info(&mpx, regs)) { /* * We failed to decode the MPX instruction. Act as if * the exception was not caused by MPX. @@ -503,14 +484,20 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) } /* * Success, we decoded the instruction and retrieved - * an 'info' containing the address being accessed + * an 'mpx' containing the address being accessed * which caused the exception. This information * allows and application to possibly handle the * #BR exception itself. */ - do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info); - kfree(info); + if (!do_trap_no_signal(tsk, X86_TRAP_BR, "bounds", regs, + error_code)) + break; + + show_signal(tsk, SIGSEGV, "trap ", "bounds", regs, error_code); + + force_sig_bnderr(mpx.addr, mpx.lower, mpx.upper); break; + } case 0: /* No exception caused by Intel MPX operations. */ goto exit_trap; default: @@ -527,12 +514,13 @@ exit_trap: * up here if the kernel has MPX turned off at compile * time.. */ - do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); } dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { + const char *desc = "general protection fault"; struct task_struct *tsk; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); @@ -551,30 +539,33 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs, X86_TRAP_GP)) + if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) return; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; - if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + + /* + * To be potentially processing a kprobe fault and to + * trust the result from kprobe_running(), we have to + * be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, X86_TRAP_GP)) + return; + + if (notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) - die("general protection fault", regs, error_code); + die(desc, regs, error_code); return; } tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, task_pid_nr(tsk), - regs->ip, regs->sp, error_code); - print_vma_addr(KERN_CONT " in ", regs->ip); - pr_cont("\n"); - } + show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); + force_sig(SIGSEGV, tsk); } NOKPROBE_SYMBOL(do_general_protection); @@ -617,7 +608,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) goto exit; cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL); cond_local_irq_disable(regs); exit: @@ -831,14 +822,14 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) { struct task_struct *task = current; struct fpu *fpu = &task->thread.fpu; - siginfo_t info; + int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr)) + if (fixup_exception(regs, trapnr, error_code, 0)) return; task->thread.error_code = error_code; @@ -857,18 +848,14 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.trap_nr = trapnr; task->thread.error_code = error_code; - clear_siginfo(&info); - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_addr = (void __user *)uprobe_get_trap_addr(regs); - - info.si_code = fpu__exception_code(fpu, trapnr); + si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ - if (!info.si_code) + if (!si_code) return; - force_sig_info(SIGFPE, &info, task); + force_sig_fault(SIGFPE, si_code, + (void __user *)uprobe_get_trap_addr(regs), task); } dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) @@ -928,20 +915,13 @@ NOKPROBE_SYMBOL(do_device_not_available); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { - siginfo_t info; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); - clear_siginfo(&info); - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_BADSTK; - info.si_addr = NULL; if (notify_die(DIE_TRAP, "iret exception", regs, error_code, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, - &info); + ILL_BADSTK, (void __user *)NULL); } } #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 1463468ba9a0..e9f777bfed40 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -26,6 +26,7 @@ #include <asm/apic.h> #include <asm/intel-family.h> #include <asm/i8259.h> +#include <asm/uv/uv.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -57,7 +58,7 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -void cyc2ns_read_begin(struct cyc2ns_data *data) +void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -74,7 +75,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -void cyc2ns_read_end(void) +void __always_inline cyc2ns_read_end(void) { preempt_enable_notrace(); } @@ -103,7 +104,7 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; @@ -246,7 +247,7 @@ unsigned long long sched_clock(void) bool using_native_sched_clock(void) { - return pv_time_ops.sched_clock == native_sched_clock; + return pv_ops.time.sched_clock == native_sched_clock; } #else unsigned long long @@ -635,7 +636,7 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_ATOM_DENVERTON: + case INTEL_FAM6_ATOM_GOLDMONT_X: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -1415,7 +1416,7 @@ static bool __init determine_cpu_tsc_frequencies(bool early) static unsigned long __init get_loops_per_jiffy(void) { - unsigned long lpj = tsc_khz * KHZ; + u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; @@ -1433,6 +1434,9 @@ void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; + /* Don't change UV TSC multi-chassis synchronization */ + if (is_early_uv_system()) + return; if (!determine_cpu_tsc_frequencies(true)) return; loops_per_jiffy = get_loops_per_jiffy(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 27ef714d886c..3d0e9aeea7c8 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -59,12 +59,12 @@ static const struct freq_desc freq_desc_ann = { }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), - INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), - INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), - INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), - INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), {} }; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index ff20b35e98dd..f8f3cfda01ae 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -271,19 +271,13 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, */ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) { - siginfo_t info; struct task_struct *tsk = current; tsk->thread.cr2 = (unsigned long)addr; tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; tsk->thread.trap_nr = X86_TRAP_PF; - clear_siginfo(&info); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = addr; - force_sig_info(SIGSEGV, &info, tsk); + force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, tsk); if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) return; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index deb576b23b7c..843feb94a950 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1086,7 +1086,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", current->pid, regs->sp, regs->ip); - force_sig_info(SIGSEGV, SEND_SIG_FORCED, current); + force_sig(SIGSEGV, current); } return -1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 1c03e4aa6474..c2fd39752da8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a419f86..0d618ee634ac 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,6 +65,23 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + #else #define X86_ALIGN_RODATA_BEGIN @@ -74,6 +91,7 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED #endif @@ -118,16 +136,6 @@ SECTIONS *(.fixup) *(.gnu.warning) -#ifdef CONFIG_X86_64 - . = ALIGN(PAGE_SIZE); - __entry_trampoline_start = .; - _entry_trampoline = .; - *(.entry_trampoline) - . = ALIGN(PAGE_SIZE); - __entry_trampoline_end = .; - ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); -#endif - #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; *(.text.__x86.indirect_thunk) @@ -355,6 +363,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) + BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 44685fb2a192..1eae5af491c2 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,7 +26,7 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 -#if defined CONFIG_PCI && defined CONFIG_PARAVIRT +#if defined CONFIG_PCI && defined CONFIG_PARAVIRT_XXL /* * Interrupt control on vSMPowered systems: * ~AC is a shadow of IF. If IF is 'on' AC should be 'off' @@ -69,17 +69,17 @@ asmlinkage __visible void vsmp_irq_enable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); -static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init vsmp_patch(u8 type, void *ibuf, unsigned long addr, unsigned len) { switch (type) { - case PARAVIRT_PATCH(pv_irq_ops.irq_enable): - case PARAVIRT_PATCH(pv_irq_ops.irq_disable): - case PARAVIRT_PATCH(pv_irq_ops.save_fl): - case PARAVIRT_PATCH(pv_irq_ops.restore_fl): - return paravirt_patch_default(type, clobbers, ibuf, addr, len); + case PARAVIRT_PATCH(irq.irq_enable): + case PARAVIRT_PATCH(irq.irq_disable): + case PARAVIRT_PATCH(irq.save_fl): + case PARAVIRT_PATCH(irq.restore_fl): + return paravirt_patch_default(type, ibuf, addr, len); default: - return native_patch(type, clobbers, ibuf, addr, len); + return native_patch(type, ibuf, addr, len); } } @@ -111,11 +111,11 @@ static void __init set_vsmp_pv_ops(void) if (cap & ctl & (1 << 4)) { /* Setup irq ops and turn on vSMP IRQ fastpath handling */ - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); - pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable); - pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); - pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); - pv_init_ops.patch = vsmp_patch; + pv_ops.irq.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); + pv_ops.irq.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable); + pv_ops.irq.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); + pv_ops.irq.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); + pv_ops.init.patch = vsmp_patch; ctl &= ~(1 << 4); } writel(ctl, address + 4); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 2792b5573818..50a2b492fdd6 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -31,7 +31,6 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } static bool __init bool_x86_init_noop(void) { return false; } static void x86_op_int_noop(int cpu) { } -static u64 u64_x86_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions @@ -96,7 +95,7 @@ struct x86_init_ops x86_init __initdata = { }, .acpi = { - .get_root_pointer = u64_x86_init_noop, + .get_root_pointer = x86_default_get_root_pointer, .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 106482da6388..34edf198708f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2711,7 +2711,16 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) return true; - /* default: (not Intel, not AMD), apply Intel's stricter rules... */ + /* Hygon ("HygonGenuine") */ + if (ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx && + ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx && + edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx) + return true; + + /* + * default: (not Intel, not AMD, not Hygon), apply Intel's + * stricter rules... + */ return false; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0cefba28c864..fbb0e6df121b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -548,7 +548,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, } int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { int i; @@ -571,18 +571,31 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (min > map->max_apic_id) + goto out; /* Bits above cluster_size are masked in the caller. */ - for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + for_each_set_bit(i, &ipi_bitmap_low, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } min += cluster_size; - for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + + if (min > map->max_apic_id) + goto out; + + for_each_set_bit(i, &ipi_bitmap_high, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } +out: rcu_read_unlock(); return count; } @@ -1331,9 +1344,8 @@ EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return kvm_apic_hw_enabled(apic) && - addr >= apic->base_address && - addr < apic->base_address + LAPIC_MMIO_LENGTH; + return addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; } static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, @@ -1345,6 +1357,15 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + memset(data, 0xff, len); + return 0; + } + kvm_lapic_reg_read(apic, offset, len, data); return 0; @@ -1904,6 +1925,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + return 0; + } + /* * APIC register must be aligned on 128-bits boundary. * 32/64/128 bits registers must be accessed thru 32 bits. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a282321329b5..e843ec46609d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -249,6 +249,17 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; */ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + + static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -357,9 +368,7 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | - shadow_nonpresent_or_rsvd_mask; - u64 gpa = spte & ~mask; + u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) & shadow_nonpresent_or_rsvd_mask; @@ -423,6 +432,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static void kvm_mmu_reset_all_pte_masks(void) { + u8 low_phys_bits; + shadow_user_mask = 0; shadow_accessed_mask = 0; shadow_dirty_mask = 0; @@ -437,12 +448,17 @@ static void kvm_mmu_reset_all_pte_masks(void) * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. */ + low_phys_bits = boot_cpu_data.x86_phys_bits; if (boot_cpu_data.x86_phys_bits < - 52 - shadow_nonpresent_or_rsvd_mask_len) + 52 - shadow_nonpresent_or_rsvd_mask_len) { shadow_nonpresent_or_rsvd_mask = rsvd_bits(boot_cpu_data.x86_phys_bits - shadow_nonpresent_or_rsvd_mask_len, boot_cpu_data.x86_phys_bits - 1); + low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; + } + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); } static int is_cpuid_PSE36(void) @@ -899,7 +915,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { /* * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_commit_zap_page() can see us + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); @@ -1853,11 +1869,6 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -3114,16 +3125,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) { - siginfo_t info; - - clear_siginfo(&info); - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_MCEERR_AR; - info.si_addr = (void __user *)address; - info.si_addr_lsb = PAGE_SHIFT; - - send_sig_info(SIGBUS, &info, tsk); + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); } static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) @@ -5217,7 +5219,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { - int r, emulation_type = EMULTYPE_RETRY; + int r, emulation_type = 0; enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map; @@ -5230,10 +5232,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_PF_EMULATE) { - emulation_type = 0; + if (r == RET_PF_EMULATE) goto emulate; - } } if (r == RET_PF_INVALID) { @@ -5260,8 +5260,19 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, return 1; } - if (mmio_info_in_cache(vcpu, cr2, direct)) - emulation_type = 0; + /* + * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still + * optimistically try to just unprotect the page and let the processor + * re-execute the instruction that caused the page fault. Do not allow + * retrying MMIO emulation, as it's not only pointless but could also + * cause us to enter an infinite loop because the processor will keep + * faulting on the non-existent MMIO address. Retrying an instruction + * from a nested guest is also pointless and dangerous as we are only + * explicitly shadowing L1's page tables, i.e. unprotecting something + * for L1 isn't going to magically fix whatever issue cause L2 to fail. + */ + if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) + emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* * On AMD platforms, under certain conditions insn_len may be zero on #NPF. @@ -5413,7 +5424,12 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - kvm_init_mmu(vcpu, true); + /* + * kvm_mmu_setup() is called only on vCPU initialization. + * Therefore, no need to reset mmu roots as they are not yet + * initialized. + */ + kvm_init_mmu(vcpu, false); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6276140044d0..61ccfb13899e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -436,14 +436,18 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) static inline bool svm_sev_enabled(void) { - return max_sev_asid; + return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0; } static inline bool sev_guest(struct kvm *kvm) { +#ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; return sev->active; +#else + return false; +#endif } static inline int sev_get_asid(struct kvm *kvm) @@ -776,7 +780,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { - if (emulate_instruction(vcpu, EMULTYPE_SKIP) != + if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -1226,8 +1230,7 @@ static __init int sev_hardware_setup(void) min_sev_asid = cpuid_edx(0x8000001F); /* Initialize SEV ASID bitmap */ - sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), - sizeof(unsigned long), GFP_KERNEL); + sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) return 1; @@ -1405,7 +1408,7 @@ static __exit void svm_hardware_unsetup(void) int cpu; if (svm_sev_enabled()) - kfree(sev_asid_bitmap); + bitmap_free(sev_asid_bitmap); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -2715,7 +2718,7 @@ static int gp_interception(struct vcpu_svm *svm) WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -2819,7 +2822,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -3861,7 +3864,7 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3869,13 +3872,13 @@ static int invlpg_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm) { - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } static int rsm_interception(struct vcpu_svm *svm) { - return x86_emulate_instruction(&svm->vcpu, 0, 0, - rsm_ins_bytes, 2) == EMULATE_DONE; + return kvm_emulate_instruction_from_buffer(&svm->vcpu, + rsm_ins_bytes, 2) == EMULATE_DONE; } static int rdpmc_interception(struct vcpu_svm *svm) @@ -4700,7 +4703,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); } return ret; @@ -6747,7 +6750,7 @@ e_free: static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) { unsigned long vaddr, vaddr_end, next_vaddr; - unsigned long dst_vaddr, dst_vaddr_end; + unsigned long dst_vaddr; struct page **src_p, **dst_p; struct kvm_sev_dbg debug; unsigned long n; @@ -6763,7 +6766,6 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) size = debug.len; vaddr_end = vaddr + size; dst_vaddr = debug.dst_uaddr; - dst_vaddr_end = dst_vaddr + size; for (; vaddr < vaddr_end; vaddr = next_vaddr) { int len, s_off, d_off; @@ -7150,6 +7152,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, + .request_immediate_exit = __kvm_request_immediate_exit, + .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d26f3c4985b..e665aa7167cf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 -#define MSR_BITMAP_MODE_LM 4 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL @@ -397,6 +396,7 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + bool hv_timer_armed; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; @@ -856,6 +856,7 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u64 vmcs01_guest_bndcfgs; u16 vpid02; u16 last_vpid; @@ -1019,6 +1020,8 @@ struct vcpu_vmx { int ple_window; bool ple_window_dirty; + bool req_immediate_exit; + /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; @@ -1569,8 +1572,12 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm) goto out; } + /* + * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the + * base of EPT PML4 table, strip off EPT configuration information. + */ ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); out: spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); @@ -2864,6 +2871,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; + vmx->req_immediate_exit = false; + if (vmx->loaded_cpu_state) return; @@ -2894,8 +2903,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -2946,8 +2954,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) vmx->loaded_cpu_state = NULL; #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -2975,24 +2982,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - rdmsrl(MSR_KERNEL_GS_BASE, - vmx->msr_guest_kernel_gs_base); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + preempt_enable(); return vmx->msr_guest_kernel_gs_base; } static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - wrmsrl(MSR_KERNEL_GS_BASE, data); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } #endif @@ -3528,9 +3530,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - if (kvm_mpx_supported()) - msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; @@ -3547,8 +3546,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_ENTRY_LOAD_IA32_PAT; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - if (kvm_mpx_supported()) - msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -3596,12 +3593,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high); msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* * We can emulate "VMCS shadowing," even if the hardware * doesn't support it. @@ -3658,6 +3655,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (flexpriority_enabled) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, @@ -5068,19 +5069,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!msr) return; - /* - * MSR_KERNEL_GS_BASE is not intercepted when the guest is in - * 64-bit mode as a 64-bit kernel may frequently access the - * MSR. This means we need to manually save/restore the MSR - * when switching between guest and host state, but only if - * the guest is in 64-bit mode. Sync our cached value if the - * guest is transitioning to 32-bit mode and the CPU contains - * guest state, i.e. the cache is stale. - */ -#ifdef CONFIG_X86_64 - if (!(efer & EFER_LMA)) - (void)vmx_read_guest_kernel_gs_base(vmx); -#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); @@ -5393,9 +5381,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * To use VMXON (and later other VMX instructions), a guest * must first be able to turn on cr4.VMXE (see handle_vmon()). * So basically the check on whether to allow nested VMX - * is here. + * is here. We operate under the default treatment of SMM, + * so VMX cannot be enabled under SMM. */ - if (!nested_vmx_allowed(vcpu)) + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) return 1; } @@ -6072,9 +6061,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) mode |= MSR_BITMAP_MODE_X2APIC_APICV; } - if (is_long_mode(vcpu)) - mode |= MSR_BITMAP_MODE_LM; - return mode; } @@ -6115,9 +6101,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) if (!changed) return; - vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, - !(mode & MSR_BITMAP_MODE_LM)); - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); @@ -6183,6 +6166,32 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); } +static u8 vmx_get_rvi(void) +{ + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; +} + +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic_page; + u32 vppr; + int rvi; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || + !nested_cpu_has_vid(get_vmcs12(vcpu)) || + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + return false; + + rvi = vmx_get_rvi(); + + vapic_page = kmap(vmx->nested.virtual_apic_page); + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); + kunmap(vmx->nested.virtual_apic_page); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -6983,7 +6992,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -7054,7 +7063,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -7157,7 +7166,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -7231,7 +7240,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_cr(struct kvm_vcpu *vcpu) @@ -7480,7 +7489,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -7547,7 +7556,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -7704,8 +7713,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return kvm_skip_emulated_instruction(vcpu); else - return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, - NULL, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == + EMULATE_DONE; } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -7748,7 +7757,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = emulate_instruction(vcpu, 0); + err = kvm_emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; @@ -7966,6 +7975,9 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (!cpu_has_vmx_preemption_timer()) + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { u64 vmx_msr; @@ -9208,7 +9220,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - kvm_lapic_expired_hv_timer(vcpu); + if (!to_vmx(vcpu)->req_immediate_exit) + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -10214,15 +10227,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; return; } - if (!cpu_need_tpr_shadow(vcpu)) - return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -10344,6 +10358,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) +{ + u8 rvi = vmx_get_rvi(); + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -10595,24 +10617,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); + if (!vmx->loaded_vmcs->hv_timer_armed) + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = true; +} + +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->hv_deadline_tsc == -1) + if (vmx->req_immediate_exit) { + vmx_arm_hv_timer(vmx, 0); return; + } - tscl = rdtsc(); - if (vmx->hv_deadline_tsc > tscl) - /* sure to be 32 bit only because checked on set_hv_timer */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; + if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + vmx_arm_hv_timer(vmx, delta_tsc); + return; + } + + if (vmx->loaded_vmcs->hv_timer_armed) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = false; } static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -10672,7 +10713,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); - vmx_arm_hv_timer(vcpu); + vmx_update_hv_timer(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -11214,6 +11255,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (kvm_mpx_supported()) { + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); + + if (mpx_enabled) { + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + } else { + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; + } + } +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -11230,8 +11288,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (nested_vmx_allowed(vcpu)) + if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); + nested_vmx_entry_exit_ctls_update(vcpu); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -11427,16 +11487,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - /* Make sure short timeouts reliably trigger an immediate vmexit. - * hrtimer_start does not guarantee this. */ - if (preemption_timeout <= 1) { + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return; } + if (vcpu->arch.virtual_tsc_khz == 0) + return; + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); @@ -11646,11 +11708,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * bits 15:8 should be zero in posted_intr_nv, * the descriptor address has been already checked * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && (!nested_cpu_has_vid(vmcs12) || !nested_exit_intr_ack_set(vcpu) || - vmcs12->posted_intr_nv & 0xff00)) + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -11993,8 +12059,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - if (vmx_mpx_supported()) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -12076,11 +12147,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control = vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is only taken from vmcs01. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ exec_control |= vmcs_config.pin_based_exec_ctrl; - if (vmx->hv_deadline_tsc == -1) - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmx->loaded_vmcs->hv_timer_armed = false; /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { @@ -12318,6 +12388,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -12537,12 +12610,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool from_vmentry = !!exit_qual; u32 dummy_exit_qual; + bool evaluate_pending_interrupts; int r = 0; + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); + enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (kvm_mpx_supported() && + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); @@ -12575,6 +12657,23 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) } /* + * If L1 had a pending IRQ/NMI until it executed + * VMLAUNCH/VMRESUME which wasn't delivered because it was + * disallowed (e.g. interrupts disabled), L0 needs to + * evaluate if this pending event should cause an exit from L2 + * to L1 or delivered directly to L2 (e.g. In case L1 don't + * intercept EXTERNAL_INTERRUPT). + * + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request, or checking RVI during evaluation of + * pending virtual interrupts. However, this setting was done + * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 + * to perform pending event evaluation by requesting a KVM_REQ_EVENT. + */ + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set @@ -12841,6 +12940,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->req_immediate_exit = true; +} + static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) { ktime_t remaining = @@ -13231,12 +13335,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (vmx->hv_deadline_tsc == -1) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - else - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -13440,18 +13539,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->hv_deadline_tsc = -1; - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif @@ -13932,6 +14025,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + return -EINVAL; + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; @@ -13988,9 +14089,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) - vmx->nested.nested_run_pending = 1; - vmx->nested.dirty_vmcs12 = true; ret = enter_vmx_non_root_mode(vcpu, NULL); if (ret) @@ -14078,6 +14176,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, @@ -14111,6 +14210,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, + .request_immediate_exit = vmx_request_immediate_exit, .sched_in = vmx_sched_in, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 506bd2b4b8bb..ca717737347e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -628,7 +628,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu)) + if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -2537,7 +2537,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_PLATFORM_INFO: if (!msr_info->host_initiated || - data & ~MSR_PLATFORM_INFO_CPUID_FAULT || (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && cpuid_fault_enabled(vcpu))) return 1; @@ -2780,6 +2779,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.osvw.status; break; case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated && + !vcpu->kvm->arch.guest_can_read_msr_platform_info) + return 1; msr_info->data = vcpu->arch.msr_platform_info; break; case MSR_MISC_FEATURES_ENABLES: @@ -2927,6 +2929,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_GET_MSR_FEATURES: + case KVM_CAP_MSR_PLATFORM_INFO: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4007,19 +4010,23 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + r = -EFAULT; if (get_user(user_data_size, &user_kvm_nested_state->size)) - return -EFAULT; + break; r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) - return r; + break; if (r > user_data_size) { if (put_user(r, &user_kvm_nested_state->size)) - return -EFAULT; - return -E2BIG; + r = -EFAULT; + else + r = -E2BIG; + break; } + r = 0; break; } @@ -4031,19 +4038,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!kvm_x86_ops->set_nested_state) break; + r = -EFAULT; if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) - return -EFAULT; + break; + r = -EINVAL; if (kvm_state.size < sizeof(kvm_state)) - return -EINVAL; + break; if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) - return -EINVAL; + break; /* nested_run_pending implies guest_mode. */ if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) - return -EINVAL; + break; r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); break; @@ -4350,6 +4359,10 @@ split_irqchip_unlock: kvm->arch.pause_in_guest = true; r = 0; break; + case KVM_CAP_MSR_PLATFORM_INFO: + kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -4685,7 +4698,7 @@ static void kvm_init_msr_list(void) */ switch (msrs_to_save[i]) { case MSR_IA32_BNDCFGS: - if (!kvm_x86_ops->mpx_supported()) + if (!kvm_mpx_supported()) continue; break; case MSR_TSC_AUX: @@ -4987,7 +5000,7 @@ int handle_ud(struct kvm_vcpu *vcpu) emul_type = 0; } - er = emulate_instruction(vcpu, emul_type); + er = kvm_emulate_instruction(vcpu, emul_type); if (er == EMULATE_USER_EXIT) return 0; if (er != EMULATE_DONE) @@ -5870,7 +5883,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, gpa_t gpa = cr2; kvm_pfn_t pfn; - if (emulation_type & EMULTYPE_NO_REEXECUTE) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + return false; + + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return false; if (!vcpu->arch.mmu.direct_map) { @@ -5958,7 +5974,10 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + return false; + + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6276,7 +6295,19 @@ restart: return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); + +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) +{ + return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction); + +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len) +{ + return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) @@ -7343,6 +7374,12 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + smp_send_reschedule(vcpu->cpu); +} +EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); + /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -7547,7 +7584,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - smp_send_reschedule(vcpu->cpu); + kvm_x86_ops->request_immediate_exit(vcpu); } trace_kvm_entry(vcpu->vcpu_id); @@ -7734,7 +7771,7 @@ static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r != EMULATE_DONE) return 0; @@ -7811,6 +7848,29 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +/* Swap (qemu) user FPU context for the guest FPU context. */ +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + ~XFEATURE_MASK_PKRU); + preempt_enable(); + trace_kvm_fpu(1); +} + +/* When vcpu_run ends, restore user space FPU context. */ +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -8159,7 +8219,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu)) { + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } @@ -8388,29 +8448,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -/* Swap (qemu) user FPU context for the guest FPU context. */ -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, - ~XFEATURE_MASK_PKRU); - preempt_enable(); - trace_kvm_fpu(1); -} - -/* When vcpu_run ends, restore user space FPU context. */ -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); - preempt_enable(); - ++vcpu->stat.fpu_reload; - trace_kvm_fpu(0); -} - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; @@ -8834,6 +8871,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); pvclock_update_vm_gtod_copy(kvm); + kvm->arch.guest_can_read_msr_platform_info = true; + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); @@ -9182,6 +9221,13 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvm_page_track_flush_slot(kvm, slot); } +static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + return (is_guest_mode(vcpu) && + kvm_x86_ops->guest_apic_has_interrupt && + kvm_x86_ops->guest_apic_has_interrupt(vcpu)); +} + static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) @@ -9206,7 +9252,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; if (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) + (kvm_cpu_has_interrupt(vcpu) || + kvm_guest_apic_has_interrupt(vcpu))) return true; if (kvm_hv_has_stimer_pending(vcpu)) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 257f27620bc2..67b9568613f3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -274,6 +274,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); +int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + int emulation_type, void *insn, int insn_len); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 46e71a74e612..ad8e0906d1ea 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -273,11 +273,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define SRC(y...) \ 9999: y; \ - _ASM_EXTABLE(9999b, 6001f) + _ASM_EXTABLE_UA(9999b, 6001f) #define DST(y...) \ 9999: y; \ - _ASM_EXTABLE(9999b, 6002f) + _ASM_EXTABLE_UA(9999b, 6002f) #ifndef CONFIG_X86_USE_PPRO_CHECKSUM diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 020f75cc8cf6..db4e5aa0858b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -92,26 +92,26 @@ ENTRY(copy_user_generic_unrolled) 60: jmp copy_user_handle_tail /* ecx is zerorest also */ .previous - _ASM_EXTABLE(1b,30b) - _ASM_EXTABLE(2b,30b) - _ASM_EXTABLE(3b,30b) - _ASM_EXTABLE(4b,30b) - _ASM_EXTABLE(5b,30b) - _ASM_EXTABLE(6b,30b) - _ASM_EXTABLE(7b,30b) - _ASM_EXTABLE(8b,30b) - _ASM_EXTABLE(9b,30b) - _ASM_EXTABLE(10b,30b) - _ASM_EXTABLE(11b,30b) - _ASM_EXTABLE(12b,30b) - _ASM_EXTABLE(13b,30b) - _ASM_EXTABLE(14b,30b) - _ASM_EXTABLE(15b,30b) - _ASM_EXTABLE(16b,30b) - _ASM_EXTABLE(18b,40b) - _ASM_EXTABLE(19b,40b) - _ASM_EXTABLE(21b,50b) - _ASM_EXTABLE(22b,50b) + _ASM_EXTABLE_UA(1b, 30b) + _ASM_EXTABLE_UA(2b, 30b) + _ASM_EXTABLE_UA(3b, 30b) + _ASM_EXTABLE_UA(4b, 30b) + _ASM_EXTABLE_UA(5b, 30b) + _ASM_EXTABLE_UA(6b, 30b) + _ASM_EXTABLE_UA(7b, 30b) + _ASM_EXTABLE_UA(8b, 30b) + _ASM_EXTABLE_UA(9b, 30b) + _ASM_EXTABLE_UA(10b, 30b) + _ASM_EXTABLE_UA(11b, 30b) + _ASM_EXTABLE_UA(12b, 30b) + _ASM_EXTABLE_UA(13b, 30b) + _ASM_EXTABLE_UA(14b, 30b) + _ASM_EXTABLE_UA(15b, 30b) + _ASM_EXTABLE_UA(16b, 30b) + _ASM_EXTABLE_UA(18b, 40b) + _ASM_EXTABLE_UA(19b, 40b) + _ASM_EXTABLE_UA(21b, 50b) + _ASM_EXTABLE_UA(22b, 50b) ENDPROC(copy_user_generic_unrolled) EXPORT_SYMBOL(copy_user_generic_unrolled) @@ -156,8 +156,8 @@ ENTRY(copy_user_generic_string) jmp copy_user_handle_tail .previous - _ASM_EXTABLE(1b,11b) - _ASM_EXTABLE(3b,12b) + _ASM_EXTABLE_UA(1b, 11b) + _ASM_EXTABLE_UA(3b, 12b) ENDPROC(copy_user_generic_string) EXPORT_SYMBOL(copy_user_generic_string) @@ -189,7 +189,7 @@ ENTRY(copy_user_enhanced_fast_string) jmp copy_user_handle_tail .previous - _ASM_EXTABLE(1b,12b) + _ASM_EXTABLE_UA(1b, 12b) ENDPROC(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) @@ -319,27 +319,27 @@ ENTRY(__copy_user_nocache) jmp copy_user_handle_tail .previous - _ASM_EXTABLE(1b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(2b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(3b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(4b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(5b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(6b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(7b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(8b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(9b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(10b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(11b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(12b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(13b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(14b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(15b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(16b,.L_fixup_4x8b_copy) - _ASM_EXTABLE(20b,.L_fixup_8b_copy) - _ASM_EXTABLE(21b,.L_fixup_8b_copy) - _ASM_EXTABLE(30b,.L_fixup_4b_copy) - _ASM_EXTABLE(31b,.L_fixup_4b_copy) - _ASM_EXTABLE(40b,.L_fixup_1b_copy) - _ASM_EXTABLE(41b,.L_fixup_1b_copy) + _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy) + _ASM_EXTABLE_UA(20b, .L_fixup_8b_copy) + _ASM_EXTABLE_UA(21b, .L_fixup_8b_copy) + _ASM_EXTABLE_UA(30b, .L_fixup_4b_copy) + _ASM_EXTABLE_UA(31b, .L_fixup_4b_copy) + _ASM_EXTABLE_UA(40b, .L_fixup_1b_copy) + _ASM_EXTABLE_UA(41b, .L_fixup_1b_copy) ENDPROC(__copy_user_nocache) EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 45a53dfe1859..a4a379e79259 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -31,14 +31,18 @@ .macro source 10: - _ASM_EXTABLE(10b, .Lbad_source) + _ASM_EXTABLE_UA(10b, .Lbad_source) .endm .macro dest 20: - _ASM_EXTABLE(20b, .Lbad_dest) + _ASM_EXTABLE_UA(20b, .Lbad_dest) .endm + /* + * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a + * potentially unmapped kernel address. + */ .macro ignore L=.Lignore 30: _ASM_EXTABLE(30b, \L) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 49b167f73215..74fdff968ea3 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -132,12 +132,12 @@ bad_get_user_8: END(bad_get_user_8) #endif - _ASM_EXTABLE(1b,bad_get_user) - _ASM_EXTABLE(2b,bad_get_user) - _ASM_EXTABLE(3b,bad_get_user) + _ASM_EXTABLE_UA(1b, bad_get_user) + _ASM_EXTABLE_UA(2b, bad_get_user) + _ASM_EXTABLE_UA(3b, bad_get_user) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(4b,bad_get_user) + _ASM_EXTABLE_UA(4b, bad_get_user) #else - _ASM_EXTABLE(4b,bad_get_user_8) - _ASM_EXTABLE(5b,bad_get_user_8) + _ASM_EXTABLE_UA(4b, bad_get_user_8) + _ASM_EXTABLE_UA(5b, bad_get_user_8) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 96dce5fe2a35..d2e5c9c39601 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -94,10 +94,10 @@ bad_put_user: EXIT END(bad_put_user) - _ASM_EXTABLE(1b,bad_put_user) - _ASM_EXTABLE(2b,bad_put_user) - _ASM_EXTABLE(3b,bad_put_user) - _ASM_EXTABLE(4b,bad_put_user) + _ASM_EXTABLE_UA(1b, bad_put_user) + _ASM_EXTABLE_UA(2b, bad_put_user) + _ASM_EXTABLE_UA(3b, bad_put_user) + _ASM_EXTABLE_UA(4b, bad_put_user) #ifdef CONFIG_X86_32 - _ASM_EXTABLE(5b,bad_put_user) + _ASM_EXTABLE_UA(5b, bad_put_user) #endif diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7add8ba06887..71fb58d44d58 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -47,8 +47,8 @@ do { \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - _ASM_EXTABLE(1b,2b) \ + _ASM_EXTABLE_UA(0b, 3b) \ + _ASM_EXTABLE_UA(1b, 2b) \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ } while (0) @@ -153,44 +153,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) "101: lea 0(%%eax,%0,4),%0\n" " jmp 100b\n" ".previous\n" - _ASM_EXTABLE(1b,100b) - _ASM_EXTABLE(2b,100b) - _ASM_EXTABLE(3b,100b) - _ASM_EXTABLE(4b,100b) - _ASM_EXTABLE(5b,100b) - _ASM_EXTABLE(6b,100b) - _ASM_EXTABLE(7b,100b) - _ASM_EXTABLE(8b,100b) - _ASM_EXTABLE(9b,100b) - _ASM_EXTABLE(10b,100b) - _ASM_EXTABLE(11b,100b) - _ASM_EXTABLE(12b,100b) - _ASM_EXTABLE(13b,100b) - _ASM_EXTABLE(14b,100b) - _ASM_EXTABLE(15b,100b) - _ASM_EXTABLE(16b,100b) - _ASM_EXTABLE(17b,100b) - _ASM_EXTABLE(18b,100b) - _ASM_EXTABLE(19b,100b) - _ASM_EXTABLE(20b,100b) - _ASM_EXTABLE(21b,100b) - _ASM_EXTABLE(22b,100b) - _ASM_EXTABLE(23b,100b) - _ASM_EXTABLE(24b,100b) - _ASM_EXTABLE(25b,100b) - _ASM_EXTABLE(26b,100b) - _ASM_EXTABLE(27b,100b) - _ASM_EXTABLE(28b,100b) - _ASM_EXTABLE(29b,100b) - _ASM_EXTABLE(30b,100b) - _ASM_EXTABLE(31b,100b) - _ASM_EXTABLE(32b,100b) - _ASM_EXTABLE(33b,100b) - _ASM_EXTABLE(34b,100b) - _ASM_EXTABLE(35b,100b) - _ASM_EXTABLE(36b,100b) - _ASM_EXTABLE(37b,100b) - _ASM_EXTABLE(99b,101b) + _ASM_EXTABLE_UA(1b, 100b) + _ASM_EXTABLE_UA(2b, 100b) + _ASM_EXTABLE_UA(3b, 100b) + _ASM_EXTABLE_UA(4b, 100b) + _ASM_EXTABLE_UA(5b, 100b) + _ASM_EXTABLE_UA(6b, 100b) + _ASM_EXTABLE_UA(7b, 100b) + _ASM_EXTABLE_UA(8b, 100b) + _ASM_EXTABLE_UA(9b, 100b) + _ASM_EXTABLE_UA(10b, 100b) + _ASM_EXTABLE_UA(11b, 100b) + _ASM_EXTABLE_UA(12b, 100b) + _ASM_EXTABLE_UA(13b, 100b) + _ASM_EXTABLE_UA(14b, 100b) + _ASM_EXTABLE_UA(15b, 100b) + _ASM_EXTABLE_UA(16b, 100b) + _ASM_EXTABLE_UA(17b, 100b) + _ASM_EXTABLE_UA(18b, 100b) + _ASM_EXTABLE_UA(19b, 100b) + _ASM_EXTABLE_UA(20b, 100b) + _ASM_EXTABLE_UA(21b, 100b) + _ASM_EXTABLE_UA(22b, 100b) + _ASM_EXTABLE_UA(23b, 100b) + _ASM_EXTABLE_UA(24b, 100b) + _ASM_EXTABLE_UA(25b, 100b) + _ASM_EXTABLE_UA(26b, 100b) + _ASM_EXTABLE_UA(27b, 100b) + _ASM_EXTABLE_UA(28b, 100b) + _ASM_EXTABLE_UA(29b, 100b) + _ASM_EXTABLE_UA(30b, 100b) + _ASM_EXTABLE_UA(31b, 100b) + _ASM_EXTABLE_UA(32b, 100b) + _ASM_EXTABLE_UA(33b, 100b) + _ASM_EXTABLE_UA(34b, 100b) + _ASM_EXTABLE_UA(35b, 100b) + _ASM_EXTABLE_UA(36b, 100b) + _ASM_EXTABLE_UA(37b, 100b) + _ASM_EXTABLE_UA(99b, 101b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -259,26 +259,26 @@ static unsigned long __copy_user_intel_nocache(void *to, "9: lea 0(%%eax,%0,4),%0\n" "16: jmp 8b\n" ".previous\n" - _ASM_EXTABLE(0b,16b) - _ASM_EXTABLE(1b,16b) - _ASM_EXTABLE(2b,16b) - _ASM_EXTABLE(21b,16b) - _ASM_EXTABLE(3b,16b) - _ASM_EXTABLE(31b,16b) - _ASM_EXTABLE(4b,16b) - _ASM_EXTABLE(41b,16b) - _ASM_EXTABLE(10b,16b) - _ASM_EXTABLE(51b,16b) - _ASM_EXTABLE(11b,16b) - _ASM_EXTABLE(61b,16b) - _ASM_EXTABLE(12b,16b) - _ASM_EXTABLE(71b,16b) - _ASM_EXTABLE(13b,16b) - _ASM_EXTABLE(81b,16b) - _ASM_EXTABLE(14b,16b) - _ASM_EXTABLE(91b,16b) - _ASM_EXTABLE(6b,9b) - _ASM_EXTABLE(7b,16b) + _ASM_EXTABLE_UA(0b, 16b) + _ASM_EXTABLE_UA(1b, 16b) + _ASM_EXTABLE_UA(2b, 16b) + _ASM_EXTABLE_UA(21b, 16b) + _ASM_EXTABLE_UA(3b, 16b) + _ASM_EXTABLE_UA(31b, 16b) + _ASM_EXTABLE_UA(4b, 16b) + _ASM_EXTABLE_UA(41b, 16b) + _ASM_EXTABLE_UA(10b, 16b) + _ASM_EXTABLE_UA(51b, 16b) + _ASM_EXTABLE_UA(11b, 16b) + _ASM_EXTABLE_UA(61b, 16b) + _ASM_EXTABLE_UA(12b, 16b) + _ASM_EXTABLE_UA(71b, 16b) + _ASM_EXTABLE_UA(13b, 16b) + _ASM_EXTABLE_UA(81b, 16b) + _ASM_EXTABLE_UA(14b, 16b) + _ASM_EXTABLE_UA(91b, 16b) + _ASM_EXTABLE_UA(6b, 9b) + _ASM_EXTABLE_UA(7b, 16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -321,9 +321,9 @@ do { \ "3: lea 0(%3,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(4b,5b) \ - _ASM_EXTABLE(0b,3b) \ - _ASM_EXTABLE(1b,2b) \ + _ASM_EXTABLE_UA(4b, 5b) \ + _ASM_EXTABLE_UA(0b, 3b) \ + _ASM_EXTABLE_UA(1b, 2b) \ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ : "3"(size), "0"(size), "1"(to), "2"(from) \ : "memory"); \ diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 9c5606d88f61..1bd837cdc4b1 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -37,8 +37,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) "3: lea 0(%[size1],%[size8],8),%[size8]\n" " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(0b,3b) - _ASM_EXTABLE(1b,2b) + _ASM_EXTABLE_UA(0b, 3b) + _ASM_EXTABLE_UA(1b, 2b) : [size8] "=&c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); clac(); @@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) return rc; } -void memcpy_flushcache(void *_dst, const void *_src, size_t size) +void __memcpy_flushcache(void *_dst, const void *_src, size_t size) { unsigned long dest = (unsigned long) _dst; unsigned long source = (unsigned long) _src; @@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size) clean_cache_range((void *) dest, size); } } -EXPORT_SYMBOL_GPL(memcpy_flushcache); +EXPORT_SYMBOL_GPL(__memcpy_flushcache); void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 076ebdce9bd4..12d7e7fb4efd 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -15,7 +15,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage) #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -83,8 +82,6 @@ static void percpu_setup_debug_store(int cpu) static void __init setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 - extern char _entry_trampoline[]; - /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; pgprot_t tss_prot = PAGE_KERNEL_RO; @@ -146,43 +143,10 @@ static void __init setup_cpu_entry_area(int cpu) cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, &per_cpu(exception_stacks, cpu), sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); - - cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, - __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); - /* - * The cpu_entry_area alias addresses are not in the kernel binary - * so they do not show up in /proc/kcore normally. This adds entries - * for them manually. - */ - kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu), - _entry_trampoline, - &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE); #endif percpu_setup_debug_store(cpu); } -#ifdef CONFIG_X86_64 -int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name) -{ - unsigned int cpu, ncpu = 0; - - if (symnum >= num_possible_cpus()) - return -EINVAL; - - for_each_possible_cpu(cpu) { - if (ncpu++ >= symnum) - break; - } - - *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline; - *type = 't'; - strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN); - - return 0; -} -#endif - static __init void setup_cpu_entry_area_ptes(void) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a12afff146d1..fc37bbd23eb8 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -19,7 +19,9 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/highmem.h> +#include <linux/pci.h> +#include <asm/e820/types.h> #include <asm/pgtable.h> /* @@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u) return (signed long)(u << shift) >> shift; } +static void note_wx(struct pg_state *st) +{ + unsigned long npages; + + npages = (st->current_address - st->start_address) / PAGE_SIZE; + +#ifdef CONFIG_PCI_BIOS + /* + * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. + * Inform about it, but avoid the warning. + */ + if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && + st->current_address <= PAGE_OFFSET + BIOS_END) { + pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); + return; + } +#endif + /* Account the WX pages */ + st->wx_pages += npages; + WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); +} + /* * This function gets called on a break in a continuous series * of PTE entries; the next one is different so we need to @@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, unsigned long delta; int width = sizeof(unsigned long) * 2; - if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { - WARN_ONCE(1, - "x86/mm: Found insecure W+X mapping at address %p/%pS\n", - (void *)st->start_address, - (void *)st->start_address); - st->wx_pages += (st->current_address - - st->start_address) / PAGE_SIZE; - } + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) + note_wx(st); /* * Now print the actual finished series diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 45f5d6cf65ae..6521134057e8 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -8,7 +8,8 @@ #include <asm/kdebug.h> typedef bool (*ex_handler_t)(const struct exception_table_entry *, - struct pt_regs *, int); + struct pt_regs *, int, unsigned long, + unsigned long); static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) @@ -22,7 +23,9 @@ ex_fixup_handler(const struct exception_table_entry *x) } __visible bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { regs->ip = ex_fixup_addr(fixup); return true; @@ -30,7 +33,9 @@ __visible bool ex_handler_default(const struct exception_table_entry *fixup, EXPORT_SYMBOL(ex_handler_default); __visible bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; @@ -43,7 +48,9 @@ EXPORT_SYMBOL_GPL(ex_handler_fault); * result of a refcount inc/dec/add/sub. */ __visible bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { /* First unconditionally saturate the refcount. */ *(int *)regs->cx = INT_MIN / 2; @@ -96,7 +103,9 @@ EXPORT_SYMBOL(ex_handler_refcount); * out all the FPU registers) if we can't restore from the task's FPU state. */ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { regs->ip = ex_fixup_addr(fixup); @@ -108,9 +117,79 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fprestore); +/* Helper to check whether a uaccess fault indicates a kernel bug. */ +static bool bogus_uaccess(struct pt_regs *regs, int trapnr, + unsigned long fault_addr) +{ + /* This is the normal case: #PF with a fault address in userspace. */ + if (trapnr == X86_TRAP_PF && fault_addr < TASK_SIZE_MAX) + return false; + + /* + * This code can be reached for machine checks, but only if the #MC + * handler has already decided that it looks like a candidate for fixup. + * This e.g. happens when attempting to access userspace memory which + * the CPU can't access because of uncorrectable bad memory. + */ + if (trapnr == X86_TRAP_MC) + return false; + + /* + * There are two remaining exception types we might encounter here: + * - #PF for faulting accesses to kernel addresses + * - #GP for faulting accesses to noncanonical addresses + * Complain about anything else. + */ + if (trapnr != X86_TRAP_PF && trapnr != X86_TRAP_GP) { + WARN(1, "unexpected trap %d in uaccess\n", trapnr); + return false; + } + + /* + * This is a faulting memory access in kernel space, on a kernel + * address, in a usercopy function. This can e.g. be caused by improper + * use of helpers like __put_user and by improper attempts to access + * userspace addresses in KERNEL_DS regions. + * The one (semi-)legitimate exception are probe_kernel_{read,write}(), + * which can be invoked from places like kgdb, /dev/mem (for reading) + * and privileged BPF code (for reading). + * The probe_kernel_*() functions set the kernel_uaccess_faults_ok flag + * to tell us that faulting on kernel addresses, and even noncanonical + * addresses, in a userspace accessor does not necessarily imply a + * kernel bug, root might just be doing weird stuff. + */ + if (current->kernel_uaccess_faults_ok) + return false; + + /* This is bad. Refuse the fixup so that we go into die(). */ + if (trapnr == X86_TRAP_PF) { + pr_emerg("BUG: pagefault on kernel address 0x%lx in non-whitelisted uaccess\n", + fault_addr); + } else { + pr_emerg("BUG: GPF in non-whitelisted uaccess (non-canonical address?)\n"); + } + return true; +} + +__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + if (bogus_uaccess(regs, trapnr, fault_addr)) + return false; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_uaccess); + __visible bool ex_handler_ext(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { + if (bogus_uaccess(regs, trapnr, fault_addr)) + return false; /* Special hack for uaccess_err */ current->thread.uaccess_err = 1; regs->ip = ex_fixup_addr(fixup); @@ -119,7 +198,9 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup, EXPORT_SYMBOL(ex_handler_ext); __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) @@ -134,7 +215,9 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, @@ -148,12 +231,14 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); __visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); - return ex_handler_default(fixup, regs, trapnr); + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_clear_fs); @@ -170,7 +255,8 @@ __visible bool ex_has_fault_handler(unsigned long ip) return handler == ex_handler_fault; } -int fixup_exception(struct pt_regs *regs, int trapnr) +int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, + unsigned long fault_addr) { const struct exception_table_entry *e; ex_handler_t handler; @@ -194,7 +280,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr) return 0; handler = ex_fixup_handler(e); - return handler(e, regs, trapnr); + return handler(e, regs, trapnr, error_code, fault_addr); } extern unsigned int early_recursion_flag; @@ -230,9 +316,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * result in a hard-to-debug panic. * * Keep in mind that not all vectors actually get here. Early - * fage faults, for example, are special. + * page faults, for example, are special. */ - if (fixup_exception(regs, trapnr)) + if (fixup_exception(regs, trapnr, regs->orig_ax, 0)) return; if (fixup_bug(regs, trapnr)) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 47bebfe6efa7..b24eb4eb9984 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,6 +16,7 @@ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ +#include <linux/efi.h> /* efi_recover_from_page_fault()*/ #include <linux/mm_types.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ @@ -25,6 +26,7 @@ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ +#include <asm/efi.h> /* efi_recover_from_page_fault()*/ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -44,17 +46,19 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) static nokprobe_inline int kprobes_fault(struct pt_regs *regs) { - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; + if (!kprobes_built_in()) + return 0; + if (user_mode(regs)) + return 0; + /* + * To be potentially processing a kprobe fault and to be allowed to call + * kprobe_running(), we have to be non-preemptible. + */ + if (preemptible()) + return 0; + if (!kprobe_running()) + return 0; + return kprobe_fault_handler(regs, X86_TRAP_PF); } /* @@ -153,79 +157,6 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) return prefetch; } -/* - * A protection key fault means that the PKRU value did not allow - * access to some PTE. Userspace can figure out what PKRU was - * from the XSAVE state, and this function fills out a field in - * siginfo so userspace can discover which protection key was set - * on the PTE. - * - * If we get here, we know that the hardware signaled a X86_PF_PK - * fault and that there was a VMA once we got in the fault - * handler. It does *not* guarantee that the VMA we find here - * was the one that we faulted on. - * - * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); - * 2. T1 : set PKRU to deny access to pkey=4, touches page - * 3. T1 : faults... - * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); - * 5. T1 : enters fault handler, takes mmap_sem, etc... - * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really - * faulted on a pte with its pkey=4. - */ -static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info, - u32 *pkey) -{ - /* This is effectively an #ifdef */ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return; - - /* Fault not from Protection Keys: nothing to do */ - if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV)) - return; - /* - * force_sig_info_fault() is called from a number of - * contexts, some of which have a VMA and some of which - * do not. The X86_PF_PK handing happens after we have a - * valid VMA, so we should never reach this without a - * valid VMA. - */ - if (!pkey) { - WARN_ONCE(1, "PKU fault with no VMA passed in"); - info->si_pkey = 0; - return; - } - /* - * si_pkey should be thought of as a strong hint, but not - * absolutely guranteed to be 100% accurate because of - * the race explained above. - */ - info->si_pkey = *pkey; -} - -static void -force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, u32 *pkey, int fault) -{ - unsigned lsb = 0; - siginfo_t info; - - clear_siginfo(&info); - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - if (fault & VM_FAULT_HWPOISON_LARGE) - lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); - if (fault & VM_FAULT_HWPOISON) - lsb = PAGE_SHIFT; - info.si_addr_lsb = lsb; - - fill_sig_info_pkey(si_signo, si_code, &info, pkey); - - force_sig_info(si_signo, &info, tsk); -} - DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); @@ -709,7 +640,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs, X86_TRAP_PF)) { + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from @@ -730,8 +661,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. */ - force_sig_info_fault(signal, si_code, address, - tsk, NULL, 0); + force_sig_fault(signal, si_code, (void __user *)address, + tsk); } /* @@ -789,6 +720,13 @@ no_context(struct pt_regs *regs, unsigned long error_code, return; /* + * Buggy firmware could access regions which might page fault, try to + * recover from such faults. + */ + if (IS_ENABLED(CONFIG_EFI)) + efi_recover_from_page_fault(address); + + /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ @@ -840,9 +778,18 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, show_opcodes(regs, loglvl); } +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 *pkey, int si_code) + unsigned long address, u32 pkey, int si_code) { struct task_struct *tsk = current; @@ -863,18 +810,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; -#ifdef CONFIG_X86_64 - /* - * Instruction fetch faults in the vsyscall page might need - * emulation. - */ - if (unlikely((error_code & X86_PF_INSTR) && - ((address & ~0xfff) == VSYSCALL_ADDR))) { - if (emulate_vsyscall(regs, address)) - return; - } -#endif - /* * To avoid leaking information about the kernel page table * layout, pretend that user-mode accesses to kernel addresses @@ -890,7 +825,10 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); + if (si_code == SEGV_PKUERR) + force_sig_pkuerr((void __user *)address, pkey); + + force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk); return; } @@ -903,35 +841,29 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 *pkey) + unsigned long address) { - __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, int si_code) + unsigned long address, u32 pkey, int si_code) { struct mm_struct *mm = current->mm; - u32 pkey; - - if (vma) - pkey = vma_pkey(vma); - /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, - (vma) ? &pkey : NULL, si_code); + __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } static noinline void bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - __bad_area(regs, error_code, address, NULL, SEGV_MAPERR); + __bad_area(regs, error_code, address, 0, SEGV_MAPERR); } static inline bool bad_area_access_from_pkeys(unsigned long error_code, @@ -960,18 +892,40 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ - if (bad_area_access_from_pkeys(error_code, vma)) - __bad_area(regs, error_code, address, vma, SEGV_PKUERR); - else - __bad_area(regs, error_code, address, vma, SEGV_ACCERR); + if (bad_area_access_from_pkeys(error_code, vma)) { + /* + * A protection key fault means that the PKRU value did not allow + * access to some PTE. Userspace can figure out what PKRU was + * from the XSAVE state. This function captures the pkey from + * the vma and passes it to userspace so userspace can discover + * which protection key was set on the PTE. + * + * If we get here, we know that the hardware signaled a X86_PF_PK + * fault and that there was a VMA once we got in the fault + * handler. It does *not* guarantee that the VMA we find here + * was the one that we faulted on. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set PKRU to deny access to pkey=4, touches page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_sem, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ + u32 pkey = vma_pkey(vma); + + __bad_area(regs, error_code, address, pkey, SEGV_PKUERR); + } else { + __bad_area(regs, error_code, address, 0, SEGV_ACCERR); + } } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - u32 *pkey, unsigned int fault) + unsigned int fault) { struct task_struct *tsk = current; - int code = BUS_ADRERR; /* Kernel mode? Handle exceptions or die: */ if (!(error_code & X86_PF_USER)) { @@ -989,18 +943,25 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { - printk(KERN_ERR + unsigned lsb = 0; + + pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); - code = BUS_MCEERR_AR; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk); + return; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 *pkey, vm_fault_t fault) + unsigned long address, vm_fault_t fault) { if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1024,27 +985,21 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, pkey, fault); + do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, pkey); + bad_area_nosemaphore(regs, error_code, address); else BUG(); } } -static int spurious_fault_check(unsigned long error_code, pte_t *pte) +static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; - /* - * Note: We do not do lazy flushing on protection key - * changes, so no spurious fault will ever set X86_PF_PK. - */ - if ((error_code & X86_PF_PK)) - return 1; return 1; } @@ -1071,7 +1026,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * (Optional Invalidation). */ static noinline int -spurious_fault(unsigned long error_code, unsigned long address) +spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; @@ -1102,27 +1057,27 @@ spurious_fault(unsigned long error_code, unsigned long address) return 0; if (p4d_large(*p4d)) - return spurious_fault_check(error_code, (pte_t *) p4d); + return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_large(*pud)) - return spurious_fault_check(error_code, (pte_t *) pud); + return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) - return spurious_fault_check(error_code, (pte_t *) pmd); + return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; - ret = spurious_fault_check(error_code, pte); + ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; @@ -1130,12 +1085,12 @@ spurious_fault(unsigned long error_code, unsigned long address) * Make sure we have permissions in PMD. * If not, then there's a bug in the page tables: */ - ret = spurious_fault_check(error_code, (pte_t *) pmd); + ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } -NOKPROBE_SYMBOL(spurious_fault); +NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; @@ -1182,6 +1137,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { + /* + * On 64-bit systems, the vsyscall page is at an address above + * TASK_SIZE_MAX, but is not considered part of the kernel + * address space. + */ + if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) + return false; + return address >= TASK_SIZE_MAX; } @@ -1203,31 +1166,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) } /* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. + * Called for all faults where 'address' is part of the kernel address + * space. Might get called for faults that originate from *code* that + * ran in userspace or the kernel. */ -static noinline void -__do_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void +do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) { - struct vm_area_struct *vma; - struct task_struct *tsk; - struct mm_struct *mm; - vm_fault_t fault, major = 0; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - u32 pkey; - - tsk = current; - mm = tsk->mm; - - prefetchw(&mm->mmap_sem); - - if (unlikely(kmmio_fault(regs, address))) - return; + /* + * Protection keys exceptions only happen on user pages. We + * have no user pages in the kernel portion of the address + * space, so do not expect them here. + */ + WARN_ON_ONCE(hw_error_code & X86_PF_PK); /* - * We fault-in kernel-space virtual memory on-demand. The + * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may @@ -1235,41 +1190,73 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * only copy the information from the master page table, * nothing more. * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. + * Before doing this on-demand faulting, ensure that the + * fault is not any of the following: + * 1. A fault on a PTE with a reserved bit set. + * 2. A fault caused by a user-mode access. (Do not demand- + * fault kernel memory due to user-mode accesses). + * 3. A fault caused by a page-level protection violation. + * (A demand fault would be on a non-present page which + * would have X86_PF_PROT==0). */ - if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - - /* Can handle a stale RO->RW TLB: */ - if (spurious_fault(error_code, address)) + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) return; + } - /* kprobes don't want to hook the spurious faults: */ - if (kprobes_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock: - */ - bad_area_nosemaphore(regs, error_code, address, NULL); + /* Was the fault spurious, caused by lazy TLB invalidation? */ + if (spurious_kernel_fault(hw_error_code, address)) + return; + /* kprobes don't want to hook the spurious faults: */ + if (kprobes_fault(regs)) return; - } + + /* + * Note, despite being a "bad area", there are quite a few + * acceptable reasons to get here, such as erratum fixups + * and handling kernel code that can fault, like get_user(). + * + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock: + */ + bad_area_nosemaphore(regs, hw_error_code, address); +} +NOKPROBE_SYMBOL(do_kern_addr_fault); + +/* Handle faults in the user portion of the address space */ +static inline +void do_user_addr_fault(struct pt_regs *regs, + unsigned long hw_error_code, + unsigned long address) +{ + unsigned long sw_error_code; + struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + vm_fault_t fault, major = 0; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + tsk = current; + mm = tsk->mm; /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobes_fault(regs))) return; - if (unlikely(error_code & X86_PF_RSVD)) - pgtable_bad(regs, error_code, address); + /* + * Reserved bits are never expected to be set on + * entries in the user portion of the page tables. + */ + if (unlikely(hw_error_code & X86_PF_RSVD)) + pgtable_bad(regs, hw_error_code, address); - if (unlikely(smap_violation(error_code, regs))) { - bad_area_nosemaphore(regs, error_code, address, NULL); + /* + * Check for invalid kernel (supervisor) access to user + * pages in the user address space. + */ + if (unlikely(smap_violation(hw_error_code, regs))) { + bad_area_nosemaphore(regs, hw_error_code, address); return; } @@ -1278,11 +1265,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, error_code, address, NULL); + bad_area_nosemaphore(regs, hw_error_code, address); return; } /* + * hw_error_code is literally the "page fault error code" passed to + * the kernel directly from the hardware. But, we will shortly be + * modifying it in software, so give it a new name. + */ + sw_error_code = hw_error_code; + + /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. * @@ -1291,7 +1285,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, */ if (user_mode(regs)) { local_irq_enable(); - error_code |= X86_PF_USER; + /* + * Up to this point, X86_PF_USER set in hw_error_code + * indicated a user-mode access. But, after this, + * X86_PF_USER in sw_error_code will indicate either + * that, *or* an implicit kernel(supervisor)-mode access + * which originated from user mode. + */ + if (!(hw_error_code & X86_PF_USER)) { + /* + * The CPU was in user mode, but the CPU says + * the fault was not a user-mode access. + * Must be an implicit kernel-mode access, + * which we do not expect to happen in the + * user address space. + */ + pr_warn_once("kernel-mode error from user-mode: %lx\n", + hw_error_code); + + sw_error_code |= X86_PF_USER; + } flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1300,31 +1313,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & X86_PF_WRITE) + if (sw_error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (error_code & X86_PF_INSTR) + if (sw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. The vsyscall page is at a high address + * (>PAGE_OFFSET), but is considered to be part of the user + * address space. + * + * The vsyscall page does not have a "real" VMA, so do this + * emulation before we go searching for VMAs. + */ + if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + /* - * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in - * the kernel and should generate an OOPS. Unfortunately, in the - * case of an erroneous fault occurring in a code path which already - * holds mmap_sem we will deadlock attempting to validate the fault - * against the address space. Luckily the kernel only validly - * references user space from well defined areas of code, which are - * listed in the exceptions table. + * Kernel-mode access to the user address space should only occur + * on well-defined single instructions listed in the exception + * tables. But, an erroneous kernel fault occurring outside one of + * those areas which also holds mmap_sem might deadlock attempting + * to validate the fault against the address space. * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a - * deadlock. Attempt to lock the address space, if we cannot we then - * validate the source. If this is invalid we can skip the address - * space check, thus avoiding the deadlock: + * Only do the expensive exception table search when we might be at + * risk of a deadlock. This happens if we + * 1. Failed to acquire mmap_sem, and + * 2. The access did not originate in userspace. Note: either the + * hardware or earlier page fault code may set X86_PF_USER + * in sw_error_code. */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if (!(error_code & X86_PF_USER) && + if (!(sw_error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { - bad_area_nosemaphore(regs, error_code, address, NULL); + /* + * Fault from code in kernel from + * which we do not expect faults. + */ + bad_area_nosemaphore(regs, sw_error_code, address); return; } retry: @@ -1340,16 +1371,16 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } - if (error_code & X86_PF_USER) { + if (sw_error_code & X86_PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter @@ -1357,12 +1388,12 @@ retry: * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } @@ -1371,8 +1402,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(error_code, vma))) { - bad_area_access_error(regs, error_code, address, vma); + if (unlikely(access_error(sw_error_code, vma))) { + bad_area_access_error(regs, sw_error_code, address, vma); return; } @@ -1388,10 +1419,7 @@ good_area: * (potentially after handling any pending signal during the return to * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. - * Thus we have to be careful about not touching vma after handling the - * fault, so we read the pkey beforehand. */ - pkey = vma_pkey(vma); fault = handle_mm_fault(vma, address, flags); major |= fault & VM_FAULT_MAJOR; @@ -1414,13 +1442,13 @@ good_area: return; /* Not returning to user mode? Handle exceptions or die: */ - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); return; } up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, &pkey, fault); + mm_fault_error(regs, sw_error_code, address, fault); return; } @@ -1438,6 +1466,28 @@ good_area: check_v8086_mode(regs, address, tsk); } +NOKPROBE_SYMBOL(do_user_addr_fault); + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) +{ + prefetchw(¤t->mm->mmap_sem); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* Was the fault on kernel-controlled part of the address space? */ + if (unlikely(fault_in_kernel_space(address))) + do_kern_addr_fault(regs, hw_error_code, address); + else + do_user_addr_fault(regs, hw_error_code, address); +} NOKPROBE_SYMBOL(__do_page_fault); static nokprobe_inline void diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a8fc26c1115..faca978ebf9d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } +void __weak mem_encrypt_free_decrypted_mem(void) { } + void __ref free_initmem(void) { e820__reallocate_tables(); + mem_encrypt_free_decrypted_mem(); + free_kernel_image_pages(&__init_begin, &__init_end); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 979e0a02cbe1..142c7d9f89cc 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -923,34 +923,19 @@ static void mark_nxdata_nx(void) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; + unsigned long size = (unsigned long)__end_rodata - start; set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", + pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); - - printk(KERN_INFO "Testing CPA: write protecting again\n"); - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); -#endif - - start += size; - size = (unsigned long)__end_rodata - start; - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", - size >> 10); - -#ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size); set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Testing CPA: write protecting again\n"); + pr_info("Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c63a545ec199..24e0920a9b25 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, * caller shouldn't need to know that small detail. */ static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, enum page_cache_mode pcm, void *caller) + unsigned long size, enum page_cache_mode pcm, + void *caller, bool encrypted) { unsigned long offset, vaddr; resource_size_t last_addr; @@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * resulting mapping. */ prot = PAGE_KERNEL_IO; - if (sev_active() && mem_flags.desc_other) + if ((sev_active() && mem_flags.desc_other) || encrypted) prot = pgprot_encrypted(prot); switch (pcm) { @@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; return __ioremap_caller(phys_addr, size, pcm, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_nocache); @@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; return __ioremap_caller(phys_addr, size, pcm, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL_GPL(ioremap_uc); @@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc); void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_wc); @@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_wt); +void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, + __builtin_return_address(0), true); +} +EXPORT_SYMBOL(ioremap_encrypted); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_cache); @@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, { return __ioremap_caller(phys_addr, size, pgprot2cachemode(__pgprot(prot_val)), - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b2de398d1fd3..006f373f54ab 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -348,6 +348,30 @@ bool sev_active(void) EXPORT_SYMBOL(sev_active); /* Architecture __weak replacement functions */ +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (mem_encrypt_active()) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} + void __init mem_encrypt_init(void) { if (!sme_me_mask) diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 7ae36868aed2..a19ef1a416ff 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -27,6 +27,7 @@ * be extended when new paravirt and debugging variants are added.) */ #undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS #include <linux/kernel.h> diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e500949bae24..2385538e8065 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -118,14 +118,11 @@ bad_opcode: * anything it wants in to the instructions. We can not * trust anything about it. They might not be valid * instructions or might encode invalid registers, etc... - * - * The caller is expected to kfree() the returned siginfo_t. */ -siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) +int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs) { const struct mpx_bndreg_state *bndregs; const struct mpx_bndreg *bndreg; - siginfo_t *info = NULL; struct insn insn; uint8_t bndregno; int err; @@ -153,11 +150,6 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) /* now go select the individual register in the set of 4 */ bndreg = &bndregs->bndreg[bndregno]; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) { - err = -ENOMEM; - goto err_out; - } /* * The registers are always 64-bit, but the upper 32 * bits are ignored in 32-bit mode. Also, note that the @@ -168,27 +160,23 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) * complains when casting from integers to different-size * pointers. */ - info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; - info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; - info->si_addr_lsb = 0; - info->si_signo = SIGSEGV; - info->si_errno = 0; - info->si_code = SEGV_BNDERR; - info->si_addr = insn_get_addr_ref(&insn, regs); + info->lower = (void __user *)(unsigned long)bndreg->lower_bound; + info->upper = (void __user *)(unsigned long)~bndreg->upper_bound; + info->addr = insn_get_addr_ref(&insn, regs); + /* * We were not able to extract an address from the instruction, * probably because there was something invalid in it. */ - if (info->si_addr == (void __user *)-1) { + if (info->addr == (void __user *)-1) { err = -EINVAL; goto err_out; } - trace_mpx_bounds_register_exception(info->si_addr, bndreg); - return info; + trace_mpx_bounds_register_exception(info->addr, bndreg); + return 0; err_out: /* info might be NULL, but kfree() handles that */ - kfree(info); - return ERR_PTR(err); + return err; } static __user void *mpx_get_bounds_dir(void) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 51a5a69ecac9..62bb30b4bd2a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -37,11 +37,20 @@ struct cpa_data { unsigned long numpages; int flags; unsigned long pfn; - unsigned force_split : 1; + unsigned force_split : 1, + force_static_prot : 1; int curpage; struct page **pages; }; +enum cpa_warn { + CPA_CONFLICT, + CPA_PROTECT, + CPA_DETECT, +}; + +static const int cpa_warn_level = CPA_PROTECT; + /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb @@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m) static inline void split_page_count(int level) { } #endif +#ifdef CONFIG_X86_CPA_STATISTICS + +static unsigned long cpa_1g_checked; +static unsigned long cpa_1g_sameprot; +static unsigned long cpa_1g_preserved; +static unsigned long cpa_2m_checked; +static unsigned long cpa_2m_sameprot; +static unsigned long cpa_2m_preserved; +static unsigned long cpa_4k_install; + +static inline void cpa_inc_1g_checked(void) +{ + cpa_1g_checked++; +} + +static inline void cpa_inc_2m_checked(void) +{ + cpa_2m_checked++; +} + +static inline void cpa_inc_4k_install(void) +{ + cpa_4k_install++; +} + +static inline void cpa_inc_lp_sameprot(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_sameprot++; + else + cpa_2m_sameprot++; +} + +static inline void cpa_inc_lp_preserved(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_preserved++; + else + cpa_2m_preserved++; +} + +static int cpastats_show(struct seq_file *m, void *p) +{ + seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); + seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); + seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); + seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); + seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); + seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); + seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); + return 0; +} + +static int cpastats_open(struct inode *inode, struct file *file) +{ + return single_open(file, cpastats_show, NULL); +} + +static const struct file_operations cpastats_fops = { + .open = cpastats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init cpa_stats_init(void) +{ + debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, + &cpastats_fops); + return 0; +} +late_initcall(cpa_stats_init); +#else +static inline void cpa_inc_1g_checked(void) { } +static inline void cpa_inc_2m_checked(void) { } +static inline void cpa_inc_4k_install(void) { } +static inline void cpa_inc_lp_sameprot(int level) { } +static inline void cpa_inc_lp_preserved(int level) { } +#endif + + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void __cpa_flush_range(void *arg) +static bool __cpa_flush_range(unsigned long start, int numpages, int cache) { - /* - * We could optimize that further and do individual per page - * tlb invalidates for a low number of pages. Caveat: we must - * flush the high aliases on 64bit as well. - */ - __flush_tlb_all(); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + WARN_ON(PAGE_ALIGN(start) != start); + + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return true; + } + + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); + + return !cache; } static void cpa_flush_range(unsigned long start, int numpages, int cache) @@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) unsigned int i, level; unsigned long addr; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - WARN_ON(PAGE_ALIGN(start) != start); - - on_each_cpu(__cpa_flush_range, NULL, 1); - - if (!cache) + if (__cpa_flush_range(start, numpages, cache)) return; /* @@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long *start, int numpages, int cache, +static void cpa_flush_array(unsigned long baddr, unsigned long *start, + int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; -#ifdef CONFIG_PREEMPT - /* - * Avoid wbinvd() because it causes latencies on all CPUs, - * regardless of any CPU isolation that may be in effect. - * - * This should be extended for CAT enabled systems independent of - * PREEMPT because wbinvd() does not respect the CAT partitions and - * this is exposed to unpriviledged users through the graphics - * subsystem. - */ - unsigned long do_wbinvd = 0; -#else - unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ -#endif - - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); - - if (!cache || do_wbinvd) + if (__cpa_flush_range(baddr, numpages, cache)) return; /* @@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, } } -/* - * Certain areas of memory on x86 require very specific protection flags, - * for example the BIOS area or kernel text. Callers don't always get this - * right (again, ioremap() on BIOS memory is not uncommon) so this function - * checks and fixes these known static required protection bits. - */ -static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, - unsigned long pfn) +static bool overlaps(unsigned long r1_start, unsigned long r1_end, + unsigned long r2_start, unsigned long r2_end) { - pgprot_t forbidden = __pgprot(0); + return (r1_start <= r2_end && r1_end >= r2_start) || + (r2_start <= r1_end && r2_end >= r1_start); +} - /* - * The BIOS area between 640k and 1Mb needs to be executable for - * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. - */ #ifdef CONFIG_PCI_BIOS - if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) - pgprot_val(forbidden) |= _PAGE_NX; +/* + * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS + * based config access (CONFIG_PCI_GOBIOS) support. + */ +#define BIOS_PFN PFN_DOWN(BIOS_BEGIN) +#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) + +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) +{ + if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) + return _PAGE_NX; + return 0; +} +#else +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) +{ + return 0; +} #endif - /* - * The kernel text needs to be executable for obvious reasons - * Does not cover __inittext since that is gone later on. On - * 64bit we do not enforce !NX on the low mapping - */ - if (within(address, (unsigned long)_text, (unsigned long)_etext)) - pgprot_val(forbidden) |= _PAGE_NX; +/* + * The .rodata section needs to be read-only. Using the pfn catches all + * aliases. This also includes __ro_after_init, so do not enforce until + * kernel_set_to_readonly is true. + */ +static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) +{ + unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); /* - * The .rodata section needs to be read-only. Using the pfn - * catches all aliases. This also includes __ro_after_init, - * so do not enforce until kernel_set_to_readonly is true. + * Note: __end_rodata is at page aligned and not inclusive, so + * subtract 1 to get the last enforced PFN in the rodata area. */ - if (kernel_set_to_readonly && - within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, - __pa_symbol(__end_rodata) >> PAGE_SHIFT)) - pgprot_val(forbidden) |= _PAGE_RW; + epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; + + if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) + return _PAGE_RW; + return 0; +} + +/* + * Protect kernel text against becoming non executable by forbidding + * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) + * out of which the kernel actually executes. Do not protect the low + * mapping. + * + * This does not cover __inittext since that is gone after boot. + */ +static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) +{ + unsigned long t_end = (unsigned long)_etext - 1; + unsigned long t_start = (unsigned long)_text; + + if (overlaps(start, end, t_start, t_end)) + return _PAGE_NX; + return 0; +} #if defined(CONFIG_X86_64) +/* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering the + * holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data at no + * extra cost. + */ +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) +{ + unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; + unsigned long t_start = (unsigned long)_text; + unsigned int level; + + if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) + return 0; /* - * Once the kernel maps the text as RO (kernel_set_to_readonly is set), - * kernel text mappings for the large page aligned text, rodata sections - * will be always read-only. For the kernel identity mappings covering - * the holes caused by this alignment can be anything that user asks. + * Don't enforce the !RW mapping for the kernel text mapping, if + * the current mapping is already using small page mapping. No + * need to work hard to preserve large page mappings in this case. * - * This will preserve the large page mappings for kernel text/data - * at no extra cost. + * This also fixes the Linux Xen paravirt guest boot failure caused + * by unexpected read-only mappings for kernel identity + * mappings. In this paravirt guest case, the kernel text mapping + * and the kernel identity mapping share the same page-table pages, + * so the protections for kernel text and identity mappings have to + * be the same. */ - if (kernel_set_to_readonly && - within(address, (unsigned long)_text, - (unsigned long)__end_rodata_hpage_align)) { - unsigned int level; - - /* - * Don't enforce the !RW mapping for the kernel text mapping, - * if the current mapping is already using small page mapping. - * No need to work hard to preserve large page mappings in this - * case. - * - * This also fixes the Linux Xen paravirt guest boot failure - * (because of unexpected read-only mappings for kernel identity - * mappings). In this paravirt guest case, the kernel text - * mapping and the kernel identity mapping share the same - * page-table pages. Thus we can't really use different - * protections for the kernel text and identity mappings. Also, - * these shared mappings are made of small page mappings. - * Thus this don't enforce !RW mapping for small page kernel - * text mapping logic will help Linux Xen parvirt guest boot - * as well. - */ - if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) - pgprot_val(forbidden) |= _PAGE_RW; - } + if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) + return _PAGE_RW; + return 0; +} +#else +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) +{ + return 0; +} #endif - prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); +static inline bool conflicts(pgprot_t prot, pgprotval_t val) +{ + return (pgprot_val(prot) & ~val) != pgprot_val(prot); +} - return prot; +static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, + unsigned long start, unsigned long end, + unsigned long pfn, const char *txt) +{ + static const char *lvltxt[] = { + [CPA_CONFLICT] = "conflict", + [CPA_PROTECT] = "protect", + [CPA_DETECT] = "detect", + }; + + if (warnlvl > cpa_warn_level || !conflicts(prot, val)) + return; + + pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", + lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), + (unsigned long long)val); +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, + unsigned long pfn, unsigned long npg, + int warnlvl) +{ + pgprotval_t forbidden, res; + unsigned long end; + + /* + * There is no point in checking RW/NX conflicts when the requested + * mapping is setting the page !PRESENT. + */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + return prot; + + /* Operate on the virtual address */ + end = start + npg * PAGE_SIZE - 1; + + res = protect_kernel_text(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); + forbidden = res; + + res = protect_kernel_text_ro(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); + forbidden |= res; + + /* Check the PFN directly */ + res = protect_pci_bios(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); + forbidden |= res; + + res = protect_rodata(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); + forbidden |= res; + + return __pgprot(pgprot_val(prot) & ~forbidden); } /* @@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, */ pte_t *lookup_address(unsigned long address, unsigned int *level) { - return lookup_address_in_pgd(pgd_offset_k(address), address, level); + return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { - if (cpa->pgd) + if (cpa->pgd) return lookup_address_in_pgd(cpa->pgd + pgd_index(address), address, level); - return lookup_address(address, level); + return lookup_address(address, level); } /* @@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) return prot; } -static int -try_preserve_large_page(pte_t *kpte, unsigned long address, - struct cpa_data *cpa) +static int __should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; + unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; + pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, old_pte, *tmp; - pgprot_t old_prot, new_prot, req_prot; - int i, do_split = 1; enum pg_level level; - if (cpa->force_split) - return 1; - - spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) - goto out_unlock; + return 1; switch (level) { case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); + cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); + cpa_inc_1g_checked(); break; default: - do_split = -EINVAL; - goto out_unlock; + return -EINVAL; } psize = page_level_size(level); @@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * Calculate the number of pages, which fit into this large * page starting at address: */ - nextpage_addr = (address + psize) & pmask; - numpages = (nextpage_addr - address) >> PAGE_SHIFT; + lpaddr = (address + psize) & pmask; + numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; @@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_val(req_prot) |= _PAGE_PSE; /* - * old_pfn points to the large page base pfn. So we need - * to add the offset of the virtual address: + * old_pfn points to the large page base pfn. So we need to add the + * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; - new_prot = static_protections(req_prot, address, pfn); + /* + * Calculate the large page base address and the number of 4K pages + * in the large page + */ + lpaddr = address & pmask; + numpages = psize >> PAGE_SHIFT; /* - * We need to check the full range, whether - * static_protection() requires a different pgprot for one of - * the pages in the range we try to preserve: + * Sanity check that the existing mapping is correct versus the static + * protections. static_protections() guards against !PRESENT, so no + * extra conditional required here. */ - addr = address & pmask; - pfn = old_pfn; - for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(req_prot, addr, pfn); + chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, + CPA_CONFLICT); - if (pgprot_val(chk_prot) != pgprot_val(new_prot)) - goto out_unlock; + if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { + /* + * Split the large page and tell the split code to + * enforce static protections. + */ + cpa->force_static_prot = 1; + return 1; } /* - * If there are no changes, return. maxpages has been updated - * above: + * Optimization: If the requested pgprot is the same as the current + * pgprot, then the large page can be preserved and no updates are + * required independent of alignment and length of the requested + * range. The above already established that the current pgprot is + * correct, which in consequence makes the requested pgprot correct + * as well if it is the same. The static protection scan below will + * not come to a different conclusion. */ - if (pgprot_val(new_prot) == pgprot_val(old_prot)) { - do_split = 0; - goto out_unlock; + if (pgprot_val(req_prot) == pgprot_val(old_prot)) { + cpa_inc_lp_sameprot(level); + return 0; } /* - * We need to change the attributes. Check, whether we can - * change the large page in one go. We request a split, when - * the address is not aligned and the number of pages is - * smaller than the number of pages in the large page. Note - * that we limited the number of possible pages already to - * the number of pages in the large page. + * If the requested range does not cover the full page, split it up */ - if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { - /* - * The address is aligned and the number of pages - * covers the full page. - */ - new_pte = pfn_pte(old_pfn, new_prot); - __set_pmd_pte(kpte, address, new_pte); - cpa->flags |= CPA_FLUSHTLB; - do_split = 0; - } + if (address != lpaddr || cpa->numpages != numpages) + return 1; -out_unlock: + /* + * Check whether the requested pgprot is conflicting with a static + * protection requirement in the large page. + */ + new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, + CPA_DETECT); + + /* + * If there is a conflict, split the large page. + * + * There used to be a 4k wise evaluation trying really hard to + * preserve the large pages, but experimentation has shown, that this + * does not help at all. There might be corner cases which would + * preserve one large page occasionally, but it's really not worth the + * extra code and cycles for the common case. + */ + if (pgprot_val(req_prot) != pgprot_val(new_prot)) + return 1; + + /* All checks passed. Update the large page mapping. */ + new_pte = pfn_pte(old_pfn, new_prot); + __set_pmd_pte(kpte, address, new_pte); + cpa->flags |= CPA_FLUSHTLB; + cpa_inc_lp_preserved(level); + return 0; +} + +static int should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + int do_split; + + if (cpa->force_split) + return 1; + + spin_lock(&pgd_lock); + do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; } +static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, + pgprot_t ref_prot, unsigned long address, + unsigned long size) +{ + unsigned int npg = PFN_DOWN(size); + pgprot_t prot; + + /* + * If should_split_large_page() discovered an inconsistent mapping, + * remove the invalid protection in the split mapping. + */ + if (!cpa->force_static_prot) + goto set; + + prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT); + + if (pgprot_val(prot) == pgprot_val(ref_prot)) + goto set; + + /* + * If this is splitting a PMD, fix it up. PUD splits cannot be + * fixed trivially as that would require to rescan the newly + * installed PMD mappings after returning from split_large_page() + * so an eventual further split can allocate the necessary PTE + * pages. Warn for now and revisit it in case this actually + * happens. + */ + if (size == PAGE_SIZE) + ref_prot = prot; + else + pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); +set: + set_pte(pte, pfn_pte(pfn, ref_prot)); +} + static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { + unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; pte_t *pbase = (pte_t *)page_address(base); - unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; - pte_t *tmp; pgprot_t ref_prot; + pte_t *tmp; spin_lock(&pgd_lock); /* @@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); - ref_pfn = pmd_pfn(*(pmd_t *)kpte); + lpaddr = address & PMD_MASK; + lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; - + lpaddr = address & PUD_MASK; + lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true @@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, * Get the target pfn from the original entry: */ pfn = ref_pfn; - for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) + split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); @@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* - * Intel Atom errata AAH41 workaround. + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * Without this, we violate the TLB application note, that says: + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." * - * The real fix should be in hw or in a microcode update, but - * we also probabilistically try to reduce the window of having - * a large TLB mixed with 4K TLBs while instruction fetches are - * going on. + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. */ - __flush_tlb_all(); + flush_tlb_all(); spin_unlock(&pgd_lock); return 0; @@ -1247,7 +1494,9 @@ repeat: pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); - new_prot = static_protections(new_prot, address, pfn); + cpa_inc_4k_install(); + new_prot = static_protections(new_prot, address, pfn, 1, + CPA_PROTECT); new_prot = pgprot_clear_protnone_bits(new_prot); @@ -1273,7 +1522,7 @@ repeat: * Check, whether we can keep the large page intact * and just change the pte: */ - do_split = try_preserve_large_page(kpte, address, cpa); + do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in @@ -1286,28 +1535,8 @@ repeat: * We have to split the large page: */ err = split_large_page(cpa, kpte, address); - if (!err) { - /* - * Do a global flush tlb after splitting the large page - * and before we do the actual change page attribute in the PTE. - * - * With out this, we violate the TLB application note, that says - * "The TLBs may contain both ordinary and large-page - * translations for a 4-KByte range of linear addresses. This - * may occur if software modifies the paging structures so that - * the page size used for the address range changes. If the two - * translations differ with respect to page frame or attributes - * (e.g., permissions), processor behavior is undefined and may - * be implementation-specific." - * - * We do this global tlb flush inside the cpa_lock, so that we - * don't allow any other cpu, with stale tlb entries change the - * page attribute in parallel, that also falls into the - * just split large page entry. - */ - flush_tlb_all(); + if (!err) goto repeat; - } return err; } @@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cache = !!pgprot2cachemode(mask_set); /* - * On success we use CLFLUSH, when the CPU supports it to - * avoid the WBINVD. If the CPU does not support it and in the - * error case we fall back to cpa_flush_all (which uses - * WBINVD): + * On error; flush everything to be sure. */ - if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(addr, numpages, cache, - cpa.flags, pages); - } else - cpa_flush_range(baddr, numpages, cache); - } else + if (ret) { cpa_flush_all(cache); + goto out; + } + + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(baddr, addr, numpages, cache, + cpa.flags, pages); + } else { + cpa_flush_range(baddr, numpages, cache); + } out: return ret; @@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 1); - else - cpa_flush_all(1); + cpa_flush_range(start, numpages, 1); ret = __change_page_attr_set_clr(&cpa, 1); @@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) * in case TLB flushing gets optimized in the cpa_flush_range() * path use the same logic as above. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 0); - else - cpa_flush_all(0); + cpa_flush_range(start, numpages, 0); return ret; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e848a4811785..59274e2c1ac4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -115,6 +115,8 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) +#define MAX_UNSHARED_PTRS_PER_PGD \ + max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) @@ -181,6 +183,7 @@ static void pgd_dtor(pgd_t *pgd) * and initialize the kernel pmds here. */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD /* * We allocate separate PMDs for the kernel part of the user page-table @@ -189,6 +192,7 @@ static void pgd_dtor(pgd_t *pgd) */ #define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) +#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { @@ -210,7 +214,9 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 +#define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 +#define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) @@ -269,7 +275,7 @@ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - *pgdp = native_make_pgd(0); + pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); @@ -428,8 +434,8 @@ static inline void _pgd_free(pgd_t *pgd) pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; - pmd_t *u_pmds[PREALLOCATED_USER_PMDS]; - pmd_t *pmds[PREALLOCATED_PMDS]; + pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; + pmd_t *pmds[MAX_PREALLOCATED_PMDS]; pgd = _pgd_alloc(); @@ -494,7 +500,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed && dirty) - *ptep = entry; + set_pte(ptep, entry); return changed; } @@ -509,7 +515,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { - *pmdp = entry; + set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, @@ -529,7 +535,7 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { - *pudp = entry; + set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, @@ -637,6 +643,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + if (idx >= __end_of_fixed_addresses) { BUG(); return; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index c1fc1ae6b429..4fee5c3003ed 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -434,11 +434,42 @@ static void __init pti_clone_p4d(unsigned long addr) } /* - * Clone the CPU_ENTRY_AREA into the user space visible page table. + * Clone the CPU_ENTRY_AREA and associated data into the user space visible + * page table. */ static void __init pti_clone_user_shared(void) { + unsigned int cpu; + pti_clone_p4d(CPU_ENTRY_AREA_BASE); + + for_each_possible_cpu(cpu) { + /* + * The SYSCALL64 entry code needs to be able to find the + * thread stack and needs one word of scratch space in which + * to spill a register. All of this lives in the TSS, in + * the sp1 and sp2 slots. + * + * This is done for all possible CPUs during boot to ensure + * that it's propagated to all mms. If we were to add one of + * these mappings during CPU hotplug, we would need to take + * some measure to make sure that every mm that subsequently + * ran on that CPU would have the relevant PGD entry in its + * pagetables. The usual vmalloc_fault() mechanism would not + * work for page faults taken in entry_SYSCALL_64 before RSP + * is set up. + */ + + unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); + phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); + pte_t *target_pte; + + target_pte = pti_user_pagetable_walk_pte(va); + if (WARN_ON(!target_pte)) + return; + + *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL); + } } #else /* CONFIG_X86_64 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e96b99eb800c..bddd6b3cee1d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> +#include <linux/ptrace.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -180,13 +181,29 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) } } +static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id) +{ + /* + * Check if the current (previous) task has access to the memory + * of the @tsk (next) task. If access is denied, make sure to + * issue a IBPB to stop user->user Spectre-v2 attacks. + * + * Note: __ptrace_may_access() returns 0 or -ERRNO. + */ + return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id && + ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB)); +} + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; + bool need_flush; + u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -240,20 +257,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* - * We don't currently support having a real mm loaded without - * our cpu set in mm_cpumask(). We have all the bookkeeping - * in place to figure out whether we would need to flush - * if our cpu were cleared in mm_cpumask(), but we don't - * currently use it. + * Even in lazy TLB mode, the CPU should stay set in the + * mm_cpumask. The TLB shootdown code can figure out from + * from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. No TLB flush required. + */ + if (!was_lazy) + return; + + /* + * Read the tlb_gen to check whether a flush is needed. + * If the TLB is up to date, just use it. + * The barrier synchronizes with the tlb_gen increment in + * the TLB shootdown code. + */ + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == + next_tlb_gen) + return; + + /* + * TLB contents went out of date while we were in lazy + * mode. Fall through to the TLB switching code below. + */ + new_asid = prev_asid; + need_flush = true; } else { - u16 new_asid; - bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -262,18 +300,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * one process from doing Spectre-v2 attacks on another. * * As an optimization, flush indirect branches only when - * switching into processes that disable dumping. This - * protects high value processes like gpg, without having - * too high performance overhead. IBPB is *expensive*! - * - * This will not flush branches when switching into kernel - * threads. It will also not flush if we switch to idle - * thread and back to the same process. It will flush if we - * switch to a different non-dumpable process. + * switching into a processes that can't be ptrace by the + * current one (as in such case, attacker has much more + * convenient way how to tamper with the next process than + * branch buffer poisoning). */ - if (tsk && tsk->mm && - tsk->mm->context.ctx_id != last_ctx_id && - get_dumpable(tsk->mm) != SUID_DUMP_USER) + if (static_cpu_has(X86_FEATURE_USE_IBPB) && + ibpb_needed(tsk, last_ctx_id)) indirect_branch_prediction_barrier(); if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -308,46 +341,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, /* Let nmi_uaccess_okay() know that we're changing CR3. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); + } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); - - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - /* Make sure we write CR3 before loaded_mm. */ - barrier(); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } - load_mm_cr4(next); - switch_ldt(real_prev, next); + /* + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. + */ + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + + /* Make sure we write CR3 before loaded_mm. */ + barrier(); + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + + if (next != real_prev) { + load_mm_cr4(next); + switch_ldt(real_prev, next); + } } /* @@ -368,20 +403,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (tlb_defer_switch_to_init_mm()) { - /* - * There's a significant optimization that may be possible - * here. We have accurate enough TLB flush tracking that we - * don't need to maintain coherence of TLB per se when we're - * lazy. We do, however, need to maintain coherence of - * paging-structure caches. We could, in principle, leave our - * old mm loaded and only switch to init_mm when - * tlb_remove_page() happens. - */ - this_cpu_write(cpu_tlbstate.is_lazy, true); - } else { - switch_mm(NULL, &init_mm, NULL); - } + this_cpu_write(cpu_tlbstate.is_lazy, true); } /* @@ -468,6 +490,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_others skipping + * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@ -528,17 +553,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, f->new_tlb_gen == local_tlb_gen + 1 && f->new_tlb_gen == mm_tlb_gen) { /* Partial flush */ - unsigned long addr; - unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; + unsigned long addr = f->start; - addr = f->start; while (addr < f->end) { __flush_tlb_one_user(addr); - addr += PAGE_SIZE; + addr += 1UL << f->stride_shift; } if (local) - count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); - trace_tlb_flush(reason, nr_pages); + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); + trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ local_flush_tlb(); @@ -571,6 +595,11 @@ static void flush_tlb_func_remote(void *info) flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); } +static bool tlb_is_not_lazy(int cpu, void *data) +{ + return !per_cpu(cpu_tlbstate.is_lazy, cpu); +} + void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@ -606,8 +635,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func_remote, + + /* + * If no page tables were freed, we can skip sending IPIs to + * CPUs in lazy TLB mode. They will flush the CPU themselves + * at the next context switch. + * + * However, if page tables are getting freed, we need to send the + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ + if (info->freed_tables) + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, + (void *)info, 1, GFP_ATOMIC, cpumask); } /* @@ -623,12 +667,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long vmflag) + unsigned long end, unsigned int stride_shift, + bool freed_tables) { int cpu; struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, + .stride_shift = stride_shift, + .freed_tables = freed_tables, }; cpu = get_cpu(); @@ -638,8 +685,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && - !(vmflag & VM_HUGETLB) && - ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { info.start = start; info.end = end; } else { diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 5559dcaddd5e..948656069cdd 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -356,7 +356,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) } else { struct pci_root_info *info; - info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) dev_err(&root->device->dev, "pci_bus %04x:%02x: ignored (out of memory)\n", diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 649bdde63e32..bfa50e65ef6c 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -93,7 +93,8 @@ static int __init early_root_info_init(void) vendor = id & 0xffff; device = (id>>16) & 0xffff; - if (vendor != PCI_VENDOR_ID_AMD) + if (vendor != PCI_VENDOR_ID_AMD && + vendor != PCI_VENDOR_ID_HYGON) continue; if (hb_probes[i].device == device) { @@ -390,7 +391,8 @@ static int __init pci_io_ecs_init(void) static int __init amd_postcore_init(void) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return 0; early_root_info_init(); diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 13f4485ca388..30a5111ae5fd 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -629,17 +629,11 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (is_vmd(pdev->bus)) + if (is_vmd(pdev->bus) && pci_is_root_bus(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334a, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334b, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334c, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x334d, quirk_no_aersid); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, quirk_no_aersid); #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index 034813d4ab1e..6cb6076223ba 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -115,7 +115,7 @@ static struct dentry *punit_dbg_file; static int punit_dbgfs_register(struct punit_device *punit_device) { - static struct dentry *dev_state; + struct dentry *dev_state; punit_dbg_file = debugfs_create_dir("punit_atom", NULL); if (!punit_dbg_file) @@ -143,8 +143,8 @@ static void punit_dbgfs_unregister(void) (kernel_ulong_t)&drv_data } static const struct x86_cpu_id intel_punit_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt), - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng), + ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt), + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng), ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), {} }; diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 5fdacb322ceb..7476b3b097e1 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -26,12 +26,14 @@ static bool early_efi_keep; */ static __init int early_efi_map_fb(void) { - unsigned long base, size; + u64 base, size; if (!early_efi_keep) return 0; base = boot_params.screen_info.lfb_base; + if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE) + base |= (u64)boot_params.screen_info.ext_lfb_base << 32; size = boot_params.screen_info.lfb_size; efi_fb = ioremap(base, size); @@ -46,9 +48,11 @@ early_initcall(early_efi_map_fb); */ static __ref void *early_efi_map(unsigned long start, unsigned long len) { - unsigned long base; + u64 base; base = boot_params.screen_info.lfb_base; + if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE) + base |= (u64)boot_params.screen_info.ext_lfb_base << 32; if (efi_fb) return (efi_fb + start); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 05ca14222463..9959657127f4 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -85,10 +85,9 @@ pgd_t * __init efi_call_phys_prolog(void) void __init efi_call_phys_epilog(pgd_t *save_pgd) { + load_fixmap_gdt(0); load_cr3(save_pgd); __flush_tlb_all(); - - load_fixmap_gdt(0); } void __init efi_runtime_update_mappings(void) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ee5d08f25ce4..e8da7f492970 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -619,18 +619,16 @@ void __init efi_dump_pagetable(void) /* * Makes the calling thread switch to/from efi_mm context. Can be used - * for SetVirtualAddressMap() i.e. current->active_mm == init_mm as well - * as during efi runtime calls i.e current->active_mm == current_mm. - * We are not mm_dropping()/mm_grabbing() any mm, because we are not - * losing/creating any references. + * in a kernel thread and user context. Preemption needs to remain disabled + * while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm + * can not change under us. + * It should be ensured that there are no concurent calls to this function. */ void efi_switch_mm(struct mm_struct *mm) { - task_lock(current); efi_scratch.prev_mm = current->active_mm; current->active_mm = mm; switch_mm(efi_scratch.prev_mm, mm, NULL); - task_unlock(current); } #ifdef CONFIG_EFI_MIXED diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 844d31cb8a0c..669babcaf245 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -16,6 +16,7 @@ #include <asm/efi.h> #include <asm/uv/uv.h> #include <asm/cpu_device_id.h> +#include <asm/reboot.h> #define EFI_MIN_RESERVE 5120 @@ -654,3 +655,80 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, } #endif + +/* + * If any access by any efi runtime service causes a page fault, then, + * 1. If it's efi_reset_system(), reboot through BIOS. + * 2. If any other efi runtime service, then + * a. Return error status to the efi caller process. + * b. Disable EFI Runtime Services forever and + * c. Freeze efi_rts_wq and schedule new process. + * + * @return: Returns, if the page fault is not handled. This function + * will never return if the page fault is handled successfully. + */ +void efi_recover_from_page_fault(unsigned long phys_addr) +{ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* + * Make sure that an efi runtime service caused the page fault. + * "efi_mm" cannot be used to check if the page fault had occurred + * in the firmware context because efi=old_map doesn't use efi_pgd. + */ + if (efi_rts_work.efi_rts_id == NONE) + return; + + /* + * Address range 0x0000 - 0x0fff is always mapped in the efi_pgd, so + * page faulting on these addresses isn't expected. + */ + if (phys_addr >= 0x0000 && phys_addr <= 0x0fff) + return; + + /* + * Print stack trace as it might be useful to know which EFI Runtime + * Service is buggy. + */ + WARN(1, FW_BUG "Page fault caused by firmware at PA: 0x%lx\n", + phys_addr); + + /* + * Buggy efi_reset_system() is handled differently from other EFI + * Runtime Services as it doesn't use efi_rts_wq. Although, + * native_machine_emergency_restart() says that machine_real_restart() + * could fail, it's better not to compilcate this fault handler + * because this case occurs *very* rarely and hence could be improved + * on a need by basis. + */ + if (efi_rts_work.efi_rts_id == RESET_SYSTEM) { + pr_info("efi_reset_system() buggy! Reboot through BIOS\n"); + machine_real_restart(MRR_BIOS); + return; + } + + /* + * Before calling EFI Runtime Service, the kernel has switched the + * calling process to efi_mm. Hence, switch back to task_mm. + */ + arch_efi_call_virt_teardown(); + + /* Signal error status to the efi caller process */ + efi_rts_work.status = EFI_ABORTED; + complete(&efi_rts_work.efi_rts_comp); + + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + pr_info("Froze efi_rts_wq and disabled EFI Runtime Services\n"); + + /* + * Call schedule() in an infinite loop, so that any spurious wake ups + * will never run efi_rts_wq again. + */ + for (;;) { + set_current_state(TASK_IDLE); + schedule(); + } + + return; +} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c index 4392c15ed9e0..dbfc5cf2aa93 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c @@ -10,7 +10,7 @@ * of the License. */ -#include <linux/gpio.h> +#include <linux/gpio/machine.h> #include <linux/platform_device.h> #include <linux/regulator/machine.h> #include <linux/regulator/fixed.h> @@ -43,7 +43,6 @@ static struct fixed_voltage_config bcm43xx_vmmc = { * real voltage and signaling are still 1.8V. */ .microvolts = 2000000, /* 1.8V */ - .gpio = -EINVAL, .startup_delay = 250 * 1000, /* 250ms */ .enable_high = 1, /* active high */ .enabled_at_boot = 0, /* disabled at boot */ @@ -58,11 +57,23 @@ static struct platform_device bcm43xx_vmmc_regulator = { }, }; +static struct gpiod_lookup_table bcm43xx_vmmc_gpio_table = { + .dev_id = "reg-fixed-voltage.0", + .table = { + GPIO_LOOKUP("0000:00:0c.0", -1, NULL, GPIO_ACTIVE_LOW), + {} + }, +}; + static int __init bcm43xx_regulator_register(void) { + struct gpiod_lookup_table *table = &bcm43xx_vmmc_gpio_table; + struct gpiod_lookup *lookup = table->table; int ret; - bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME); + lookup[0].chip_hwnum = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME); + gpiod_add_lookup_table(table); + ret = platform_device_register(&bcm43xx_vmmc_regulator); if (ret) { pr_err("%s: vmmc regulator register failed\n", __func__); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 5a0483e7bf66..31dce781364c 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -68,7 +68,7 @@ static struct bt_sfi_data tng_bt_sfi_data __initdata = { { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata } static const struct x86_cpu_id bt_sfi_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data), + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, tng_bt_sfi_data), {} }; diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c index a2b4efddd61a..8e7ddd7e313a 100644 --- a/arch/x86/platform/olpc/olpc-xo1-rtc.c +++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c @@ -16,6 +16,7 @@ #include <asm/msr.h> #include <asm/olpc.h> +#include <asm/x86_init.h> static void rtc_wake_on(struct device *dev) { @@ -75,6 +76,8 @@ static int __init xo1_rtc_init(void) if (r) return r; + x86_platform.legacy.rtc = 0; + device_init_wakeup(&xo1_rtc_device.dev, 1); return 0; } diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c index fd39301f25ac..7e56fc74093c 100644 --- a/arch/x86/platform/ts5500/ts5500.c +++ b/arch/x86/platform/ts5500/ts5500.c @@ -24,7 +24,6 @@ #include <linux/kernel.h> #include <linux/leds.h> #include <linux/init.h> -#include <linux/platform_data/gpio-ts5500.h> #include <linux/platform_data/max197.h> #include <linux/platform_device.h> #include <linux/slab.h> diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index a4701389562c..37923d715741 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -7,4 +7,4 @@ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_cpu.o := $(nostackp) obj-$(CONFIG_PM_SLEEP) += cpu.o -obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o +obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c new file mode 100644 index 000000000000..bcddf09b5aa3 --- /dev/null +++ b/arch/x86/power/hibernate.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hibernation support for x86 + * + * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> + * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> + */ +#include <linux/gfp.h> +#include <linux/smp.h> +#include <linux/suspend.h> +#include <linux/scatterlist.h> +#include <linux/kdebug.h> + +#include <crypto/hash.h> + +#include <asm/e820/api.h> +#include <asm/init.h> +#include <asm/proto.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/mtrr.h> +#include <asm/sections.h> +#include <asm/suspend.h> +#include <asm/tlbflush.h> + +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address __visible; +unsigned long jump_address_phys; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3 __visible; +unsigned long temp_pgt __visible; +unsigned long relocated_restore_code __visible; + +/** + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn; + unsigned long nosave_end_pfn; + + nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + + return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn; +} + + +#define MD5_DIGEST_SIZE 16 + +struct restore_data_record { + unsigned long jump_address; + unsigned long jump_address_phys; + unsigned long cr3; + unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; +}; + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 table + * + * @table: the e820 table to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820_table *table, void *buf) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int size; + int ret = 0; + + tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(tfm)) + return -ENOMEM; + + desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto free_tfm; + } + + desc->tfm = tfm; + desc->flags = 0; + + size = offsetof(struct e820_table, entries) + + sizeof(struct e820_entry) * table->nr_entries; + + if (crypto_shash_digest(desc, (u8 *)table, size, buf)) + ret = -EINVAL; + + kzfree(desc); + +free_tfm: + crypto_free_shash(tfm); + return ret; +} + +static int hibernation_e820_save(void *buf) +{ + return get_e820_md5(e820_table_firmware, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_table_firmware, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static int hibernation_e820_save(void *buf) +{ + return 0; +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif + +#ifdef CONFIG_X86_64 +#define RESTORE_MAGIC 0x23456789ABCDEF01UL +#else +#define RESTORE_MAGIC 0x12345678UL +#endif + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->magic = RESTORE_MAGIC; + rdr->jump_address = (unsigned long)restore_registers; + rdr->jump_address_phys = __pa_symbol(restore_registers); + + /* + * The restore code fixes up CR3 and CR4 in the following sequence: + * + * [in hibernation asm] + * 1. CR3 <= temporary page tables + * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) + * 3. CR3 <= rdr->cr3 + * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) + * [in restore_processor_state()] + * 5. CR4 <= saved CR4 + * 6. CR3 <= saved CR3 + * + * Our mmu_cr4_features has CR4.PCIDE=0, and toggling + * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so + * rdr->cr3 needs to point to valid page tables but must not + * have any of the PCID bits set. + */ + rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; + + return hibernation_e820_save(rdr->e820_digest); +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + restore_jump_address = rdr->jump_address; + jump_address_phys = rdr->jump_address_phys; + restore_cr3 = rdr->cr3; + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; +} + +int relocate_restore_code(void) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + relocated_restore_code = get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + + memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); + + /* Make the page containing the relocated code executable */ + pgd = (pgd_t *)__va(read_cr3_pa()) + + pgd_index(relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); + if (pud_large(*pud)) { + set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; + } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: + __flush_tlb_all(); + return 0; +} diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index afc4ed7b1578..15695e30f982 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -14,9 +14,7 @@ #include <asm/pgtable.h> #include <asm/mmzone.h> #include <asm/sections.h> - -/* Defined in hibernate_asm_32.S */ -extern int restore_image(void); +#include <asm/suspend.h> /* Pointer to the temporary resume page tables */ pgd_t *resume_pg_dir; @@ -145,6 +143,32 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir) #endif } +static int set_up_temporary_text_mapping(pgd_t *pgd_base) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_base + pgd_index(restore_jump_address); + + pmd = resume_one_md_table_init(pgd); + if (!pmd) + return -ENOMEM; + + if (boot_cpu_has(X86_FEATURE_PSE)) { + set_pmd(pmd + pmd_index(restore_jump_address), + __pmd((jump_address_phys & PMD_MASK) | pgprot_val(PAGE_KERNEL_LARGE_EXEC))); + } else { + pte = resume_one_page_table_init(pmd); + if (!pte) + return -ENOMEM; + set_pte(pte + pte_index(restore_jump_address), + __pte((jump_address_phys & PAGE_MASK) | pgprot_val(PAGE_KERNEL_EXEC))); + } + + return 0; +} + asmlinkage int swsusp_arch_resume(void) { int error; @@ -154,22 +178,22 @@ asmlinkage int swsusp_arch_resume(void) return -ENOMEM; resume_init_first_level_page_table(resume_pg_dir); + + error = set_up_temporary_text_mapping(resume_pg_dir); + if (error) + return error; + error = resume_physical_mapping_init(resume_pg_dir); if (error) return error; + temp_pgt = __pa(resume_pg_dir); + + error = relocate_restore_code(); + if (error) + return error; + /* We have got enough memory and from now on we cannot recover */ restore_image(); return 0; } - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f8e3b668d20b..239f424ccb29 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -26,26 +26,6 @@ #include <asm/suspend.h> #include <asm/tlbflush.h> -/* Defined in hibernate_asm_64.S */ -extern asmlinkage __visible int restore_image(void); - -/* - * Address to jump to in the last phase of restore in order to get to the image - * kernel's text (this value is passed in the image header). - */ -unsigned long restore_jump_address __visible; -unsigned long jump_address_phys; - -/* - * Value of the cr3 register from before the hibernation (this value is passed - * in the image header). - */ -unsigned long restore_cr3 __visible; - -unsigned long temp_level4_pgt __visible; - -unsigned long relocated_restore_code __visible; - static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; @@ -141,46 +121,7 @@ static int set_up_temporary_mappings(void) return result; } - temp_level4_pgt = __pa(pgd); - return 0; -} - -static int relocate_restore_code(void) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - relocated_restore_code = get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - - memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); - - /* Make the page containing the relocated code executable */ - pgd = (pgd_t *)__va(read_cr3_pa()) + - pgd_index(relocated_restore_code); - p4d = p4d_offset(pgd, relocated_restore_code); - if (p4d_large(*p4d)) { - set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); - goto out; - } - pud = pud_offset(p4d, relocated_restore_code); - if (pud_large(*pud)) { - set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - goto out; - } - pmd = pmd_offset(pud, relocated_restore_code); - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - goto out; - } - pte = pte_offset_kernel(pmd, relocated_restore_code); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); -out: - __flush_tlb_all(); + temp_pgt = __pa(pgd); return 0; } @@ -200,166 +141,3 @@ asmlinkage int swsusp_arch_resume(void) restore_image(); return 0; } - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -#define MD5_DIGEST_SIZE 16 - -struct restore_data_record { - unsigned long jump_address; - unsigned long jump_address_phys; - unsigned long cr3; - unsigned long magic; - u8 e820_digest[MD5_DIGEST_SIZE]; -}; - -#define RESTORE_MAGIC 0x23456789ABCDEF01UL - -#if IS_BUILTIN(CONFIG_CRYPTO_MD5) -/** - * get_e820_md5 - calculate md5 according to given e820 table - * - * @table: the e820 table to be calculated - * @buf: the md5 result to be stored to - */ -static int get_e820_md5(struct e820_table *table, void *buf) -{ - struct crypto_shash *tfm; - struct shash_desc *desc; - int size; - int ret = 0; - - tfm = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(tfm)) - return -ENOMEM; - - desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto free_tfm; - } - - desc->tfm = tfm; - desc->flags = 0; - - size = offsetof(struct e820_table, entries) + - sizeof(struct e820_entry) * table->nr_entries; - - if (crypto_shash_digest(desc, (u8 *)table, size, buf)) - ret = -EINVAL; - - kzfree(desc); - -free_tfm: - crypto_free_shash(tfm); - return ret; -} - -static void hibernation_e820_save(void *buf) -{ - get_e820_md5(e820_table_firmware, buf); -} - -static bool hibernation_e820_mismatch(void *buf) -{ - int ret; - u8 result[MD5_DIGEST_SIZE]; - - memset(result, 0, MD5_DIGEST_SIZE); - /* If there is no digest in suspend kernel, let it go. */ - if (!memcmp(result, buf, MD5_DIGEST_SIZE)) - return false; - - ret = get_e820_md5(e820_table_firmware, result); - if (ret) - return true; - - return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; -} -#else -static void hibernation_e820_save(void *buf) -{ -} - -static bool hibernation_e820_mismatch(void *buf) -{ - /* If md5 is not builtin for restore kernel, let it go. */ - return false; -} -#endif - -/** - * arch_hibernation_header_save - populate the architecture specific part - * of a hibernation image header - * @addr: address to save the data at - */ -int arch_hibernation_header_save(void *addr, unsigned int max_size) -{ - struct restore_data_record *rdr = addr; - - if (max_size < sizeof(struct restore_data_record)) - return -EOVERFLOW; - rdr->jump_address = (unsigned long)restore_registers; - rdr->jump_address_phys = __pa_symbol(restore_registers); - - /* - * The restore code fixes up CR3 and CR4 in the following sequence: - * - * [in hibernation asm] - * 1. CR3 <= temporary page tables - * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) - * 3. CR3 <= rdr->cr3 - * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) - * [in restore_processor_state()] - * 5. CR4 <= saved CR4 - * 6. CR3 <= saved CR3 - * - * Our mmu_cr4_features has CR4.PCIDE=0, and toggling - * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so - * rdr->cr3 needs to point to valid page tables but must not - * have any of the PCID bits set. - */ - rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; - - rdr->magic = RESTORE_MAGIC; - - hibernation_e820_save(rdr->e820_digest); - - return 0; -} - -/** - * arch_hibernation_header_restore - read the architecture specific data - * from the hibernation image header - * @addr: address to read the data from - */ -int arch_hibernation_header_restore(void *addr) -{ - struct restore_data_record *rdr = addr; - - restore_jump_address = rdr->jump_address; - jump_address_phys = rdr->jump_address_phys; - restore_cr3 = rdr->cr3; - - if (rdr->magic != RESTORE_MAGIC) { - pr_crit("Unrecognized hibernate image header format!\n"); - return -EINVAL; - } - - if (hibernation_e820_mismatch(rdr->e820_digest)) { - pr_crit("Hibernate inconsistent memory map detected!\n"); - return -ENODEV; - } - - return 0; -} diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 6e56815e13a0..6fe383002125 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -12,6 +12,7 @@ #include <asm/page_types.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/frame.h> .text @@ -24,13 +25,30 @@ ENTRY(swsusp_arch_suspend) pushfl popl saved_context_eflags + /* save cr3 */ + movl %cr3, %eax + movl %eax, restore_cr3 + + FRAME_BEGIN call swsusp_save + FRAME_END ret +ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) + /* prepare to jump to the image kernel */ + movl restore_jump_address, %ebx + movl restore_cr3, %ebp + movl mmu_cr4_features, %ecx - movl resume_pg_dir, %eax - subl $__PAGE_OFFSET, %eax + + /* jump to relocated restore code */ + movl relocated_restore_code, %eax + jmpl *%eax + +/* code below has been relocated to a safe page */ +ENTRY(core_restore_code) + movl temp_pgt, %eax movl %eax, %cr3 jecxz 1f # cr4 Pentium and higher, skip if zero @@ -49,7 +67,7 @@ copy_loop: movl pbe_address(%edx), %esi movl pbe_orig_address(%edx), %edi - movl $1024, %ecx + movl $(PAGE_SIZE >> 2), %ecx rep movsl @@ -58,10 +76,13 @@ copy_loop: .p2align 4,,7 done: + jmpl *%ebx + + /* code below belongs to the image kernel */ + .align PAGE_SIZE +ENTRY(restore_registers) /* go back to the original page tables */ - movl $swapper_pg_dir, %eax - subl $__PAGE_OFFSET, %eax - movl %eax, %cr3 + movl %ebp, %cr3 movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on @@ -82,4 +103,8 @@ done: xorl %eax, %eax + /* tell the hibernation core that we've just restored the memory */ + movl %eax, in_suspend + ret +ENDPROC(restore_registers) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index fd369a6e9ff8..3008baa2fa95 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -59,7 +59,7 @@ ENTRY(restore_image) movq restore_cr3(%rip), %r9 /* prepare to switch to temporary page tables */ - movq temp_level4_pgt(%rip), %rax + movq temp_pgt(%rip), %rax movq mmu_cr4_features(%rip), %rbx /* prepare to copy image data to their original locations */ diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 3a6c8ebc8032..0b08067c45f3 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -196,6 +196,7 @@ static const char *rel_type(unsigned type) #if ELF_BITS == 64 REL_TYPE(R_X86_64_NONE), REL_TYPE(R_X86_64_64), + REL_TYPE(R_X86_64_PC64), REL_TYPE(R_X86_64_PC32), REL_TYPE(R_X86_64_GOT32), REL_TYPE(R_X86_64_PLT32), @@ -782,6 +783,15 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, add_reloc(&relocs32neg, offset); break; + case R_X86_64_PC64: + /* + * Only used by jump labels + */ + if (is_percpu_sym(sym, symname)) + die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", + symname); + break; + case R_X86_64_32: case R_X86_64_32S: case R_X86_64_64: diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 548197212a45..413f3519d9a1 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -116,8 +116,7 @@ do { \ #define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ #define R_X86_64_8 14 /* Direct 8 bit sign extended */ #define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ - -#define R_X86_64_NUM 16 +#define R_X86_64_PC64 24 /* Place relative 64-bit signed */ /* * This is used to ensure we don't load something for the wrong architecture. diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c1f98f32c45f..1ef391aa184d 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -18,6 +18,7 @@ config XEN_PV bool "Xen PV guest support" default y depends on XEN + select PARAVIRT_XXL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU help @@ -68,7 +69,6 @@ config XEN_SAVE_RESTORE config XEN_DEBUG_FS bool "Enable Xen debug and tuning parameters in debugfs" depends on XEN && DEBUG_FS - default n help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index d83cb5478f54..dd2550d33b38 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,25 +12,46 @@ endif # Make sure early boot has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_enlighten_pv.o := $(nostackp) -CFLAGS_mmu_pv.o := $(nostackp) +CFLAGS_mmu_pv.o := $(nostackp) -obj-y := enlighten.o multicalls.o mmu.o irq.o \ - time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o +obj-y += enlighten.o +obj-y += mmu.o +obj-y += time.o +obj-y += grant-table.o +obj-y += suspend.o -obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o mmu_hvm.o suspend_hvm.o -obj-$(CONFIG_XEN_PV) += setup.o apic.o pmu.o suspend_pv.o \ - p2m.o enlighten_pv.o mmu_pv.o -obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o +obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o +obj-$(CONFIG_XEN_PVHVM) += mmu_hvm.o +obj-$(CONFIG_XEN_PVHVM) += suspend_hvm.o +obj-$(CONFIG_XEN_PVHVM) += platform-pci-unplug.o -obj-$(CONFIG_EVENT_TRACING) += trace.o +obj-$(CONFIG_XEN_PV) += setup.o +obj-$(CONFIG_XEN_PV) += apic.o +obj-$(CONFIG_XEN_PV) += pmu.o +obj-$(CONFIG_XEN_PV) += suspend_pv.o +obj-$(CONFIG_XEN_PV) += p2m.o +obj-$(CONFIG_XEN_PV) += enlighten_pv.o +obj-$(CONFIG_XEN_PV) += mmu_pv.o +obj-$(CONFIG_XEN_PV) += irq.o +obj-$(CONFIG_XEN_PV) += multicalls.o +obj-$(CONFIG_XEN_PV) += xen-asm.o +obj-$(CONFIG_XEN_PV) += xen-asm_$(BITS).o + +obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o +obj-$(CONFIG_XEN_PVH) += xen-pvh.o + +obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_XEN_PV_SMP) += smp_pv.o obj-$(CONFIG_XEN_PVHVM_SMP) += smp_hvm.o + obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o + obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o + obj-$(CONFIG_XEN_DOM0) += vga.o + obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o + obj-$(CONFIG_XEN_EFI) += efi.o -obj-$(CONFIG_XEN_PVH) += xen-pvh.o diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 1804b27f9632..1fbb629a9d78 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Oracle Co., Daniel Kiper - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/bitops.h> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2eeddd814653..67b2f31a1265 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG #include <linux/bootmem.h> #endif @@ -5,6 +7,7 @@ #include <linux/kexec.h> #include <linux/slab.h> +#include <xen/xen.h> #include <xen/features.h> #include <xen/page.h> #include <xen/interface/memory.h> diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 19c1ff542387..0e75642d42a3 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #include <linux/acpi.h> #include <linux/cpu.h> #include <linux/kexec.h> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 52a7c3faee0c..ec7a4209f310 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -995,11 +995,14 @@ void __init xen_setup_vcpu_info_placement(void) * percpu area for all cpus, so make use of it. */ if (xen_have_vcpu_info_placement) { - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); - pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); - pv_mmu_ops.read_cr2 = xen_read_cr2_direct; + pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_ops.irq.restore_fl = + __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); + pv_ops.irq.irq_disable = + __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); + pv_ops.irq.irq_enable = + __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); + pv_ops.mmu.read_cr2 = xen_read_cr2_direct; } } @@ -1174,14 +1177,14 @@ static void __init xen_boot_params_init_edd(void) */ static void __init xen_setup_gdt(int cpu) { - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; - pv_cpu_ops.load_gdt = xen_load_gdt_boot; + pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; + pv_ops.cpu.load_gdt = xen_load_gdt_boot; setup_stack_canary_segment(cpu); switch_to_new_gdt(cpu); - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; - pv_cpu_ops.load_gdt = xen_load_gdt; + pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; + pv_ops.cpu.load_gdt = xen_load_gdt; } static void __init xen_dom0_set_legacy_features(void) @@ -1206,8 +1209,8 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_init_ops.patch = paravirt_patch_default; - pv_cpu_ops = xen_cpu_ops; + pv_ops.init.patch = paravirt_patch_default; + pv_ops.cpu = xen_cpu_ops; xen_init_irq_ops(); /* @@ -1276,8 +1279,10 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { - pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; - pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; + pv_ops.mmu.ptep_modify_prot_start = + xen_ptep_modify_prot_start; + pv_ops.mmu.ptep_modify_prot_commit = + xen_ptep_modify_prot_commit; } machine_ops = xen_machine_ops; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index c85d1a88f476..02e3ab7ff242 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -11,6 +11,7 @@ #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/interface/memory.h> #include <xen/interface/hvm/start_info.h> @@ -75,7 +76,7 @@ static void __init init_pvh_bootparams(void) * Version 2.12 supports Xen entry point but we will use default x86/PC * environment (i.e. hardware_subarch 0). */ - pvh_bootparams.hdr.version = 0x212; + pvh_bootparams.hdr.version = (2 << 8) | 12; pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ x86_init.acpi.get_root_pointer = pvh_get_root_pointer; diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 92ccc718152d..ecb0d5450334 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT /****************************************************************************** * grant_table.c * x86 specific part @@ -8,30 +9,6 @@ * Copyright (c) 2004-2005, K A Fraser * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> * VA Linux Systems Japan. Split out x86 specific part. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. */ #include <linux/sched.h> diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 7515a19fd324..850c93f346c7 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -128,6 +128,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { void __init xen_init_irq_ops(void) { - pv_irq_ops = xen_irq_ops; + pv_ops.irq = xen_irq_ops; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 96fc2f0fdbfe..60e9c37fd79f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #include <linux/pfn.h> #include <asm/xen/page.h> #include <asm/xen/hypercall.h> @@ -6,12 +8,6 @@ #include "multicalls.h" #include "mmu.h" -/* - * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and balloon lists. - */ -DEFINE_SPINLOCK(xen_reservation_lock); - unsigned long arbitrary_virt_to_mfn(void *vaddr) { xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); @@ -42,186 +38,6 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) } EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); -static noinline void xen_flush_tlb_all(void) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_ALL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -#define REMAP_BATCH_SIZE 16 - -struct remap_data { - xen_pfn_t *pfn; - bool contiguous; - bool no_translate; - pgprot_t prot; - struct mmu_update *mmu_update; -}; - -static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) -{ - struct remap_data *rmd = data; - pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot)); - - /* - * If we have a contiguous range, just update the pfn itself, - * else update pointer to be "next pfn". - */ - if (rmd->contiguous) - (*rmd->pfn)++; - else - rmd->pfn++; - - rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; - rmd->mmu_update->ptr |= rmd->no_translate ? - MMU_PT_UPDATE_NO_TRANSLATE : - MMU_NORMAL_PT_UPDATE; - rmd->mmu_update->val = pte_val_ma(pte); - rmd->mmu_update++; - - return 0; -} - -static int do_remap_pfn(struct vm_area_struct *vma, - unsigned long addr, - xen_pfn_t *pfn, int nr, - int *err_ptr, pgprot_t prot, - unsigned int domid, - bool no_translate, - struct page **pages) -{ - int err = 0; - struct remap_data rmd; - struct mmu_update mmu_update[REMAP_BATCH_SIZE]; - unsigned long range; - int mapped = 0; - - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); - - rmd.pfn = pfn; - rmd.prot = prot; - /* - * We use the err_ptr to indicate if there we are doing a contiguous - * mapping or a discontigious mapping. - */ - rmd.contiguous = !err_ptr; - rmd.no_translate = no_translate; - - while (nr) { - int index = 0; - int done = 0; - int batch = min(REMAP_BATCH_SIZE, nr); - int batch_left = batch; - range = (unsigned long)batch << PAGE_SHIFT; - - rmd.mmu_update = mmu_update; - err = apply_to_page_range(vma->vm_mm, addr, range, - remap_area_pfn_pte_fn, &rmd); - if (err) - goto out; - - /* We record the error for each page that gives an error, but - * continue mapping until the whole set is done */ - do { - int i; - - err = HYPERVISOR_mmu_update(&mmu_update[index], - batch_left, &done, domid); - - /* - * @err_ptr may be the same buffer as @gfn, so - * only clear it after each chunk of @gfn is - * used. - */ - if (err_ptr) { - for (i = index; i < index + done; i++) - err_ptr[i] = 0; - } - if (err < 0) { - if (!err_ptr) - goto out; - err_ptr[i] = err; - done++; /* Skip failed frame. */ - } else - mapped += done; - batch_left -= done; - index += done; - } while (batch_left); - - nr -= batch; - addr += range; - if (err_ptr) - err_ptr += batch; - cond_resched(); - } -out: - - xen_flush_tlb_all(); - - return err < 0 ? err : mapped; -} - -int xen_remap_domain_gfn_range(struct vm_area_struct *vma, - unsigned long addr, - xen_pfn_t gfn, int nr, - pgprot_t prot, unsigned domid, - struct page **pages) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -EOPNOTSUPP; - - return do_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false, - pages); -} -EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); - -int xen_remap_domain_gfn_array(struct vm_area_struct *vma, - unsigned long addr, - xen_pfn_t *gfn, int nr, - int *err_ptr, pgprot_t prot, - unsigned domid, struct page **pages) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, - prot, domid, pages); - - /* We BUG_ON because it's a programmer error to pass a NULL err_ptr, - * and the consequences later is quite hard to detect what the actual - * cause of "wrong memory was mapped in". - */ - BUG_ON(err_ptr == NULL); - return do_remap_pfn(vma, addr, gfn, nr, err_ptr, prot, domid, - false, pages); -} -EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); - -int xen_remap_domain_mfn_array(struct vm_area_struct *vma, - unsigned long addr, - xen_pfn_t *mfn, int nr, - int *err_ptr, pgprot_t prot, - unsigned int domid, struct page **pages) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -EOPNOTSUPP; - - return do_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid, - true, pages); -} -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); - /* Returns: 0 success */ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int nr, struct page **pages) diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index dd2ad82eee80..57409373750f 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -73,7 +73,7 @@ static int is_pagetable_dying_supported(void) void __init xen_hvm_init_mmu_ops(void) { if (is_pagetable_dying_supported()) - pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; + pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap; #ifdef CONFIG_PROC_VMCORE WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram)); #endif diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2fe5c9b1816b..70ea598a37d2 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + /* * Xen mmu operations * @@ -99,6 +101,12 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; #endif /* CONFIG_X86_64 */ /* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and balloon lists. + */ +static DEFINE_SPINLOCK(xen_reservation_lock); + +/* * Note about cr3 (pagetable base) values: * * xen_cr3 contains the current logical cr3 value; it contains the @@ -1907,7 +1915,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); - /* L3_k[511][506] -> level1_fixmap_pgt */ + /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */ convert_pfn_mfn(level2_fixmap_pgt); /* We get [511][511] and have Xen's version of level2_kernel_pgt */ @@ -1952,7 +1960,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + for (i = 0; i < FIXMAP_PMD_NUM; i++) { + set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE, + PAGE_KERNEL_RO); + } /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, @@ -2205,7 +2217,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) set_page_prot(initial_page_table, PAGE_KERNEL); set_page_prot(initial_kernel_pmd, PAGE_KERNEL); - pv_mmu_ops.write_cr3 = &xen_write_cr3; + pv_ops.mmu.write_cr3 = &xen_write_cr3; } /* @@ -2354,27 +2366,27 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void __init xen_post_allocator_init(void) { - pv_mmu_ops.set_pte = xen_set_pte; - pv_mmu_ops.set_pmd = xen_set_pmd; - pv_mmu_ops.set_pud = xen_set_pud; + pv_ops.mmu.set_pte = xen_set_pte; + pv_ops.mmu.set_pmd = xen_set_pmd; + pv_ops.mmu.set_pud = xen_set_pud; #ifdef CONFIG_X86_64 - pv_mmu_ops.set_p4d = xen_set_p4d; + pv_ops.mmu.set_p4d = xen_set_p4d; #endif /* This will work as long as patching hasn't happened yet (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; + pv_ops.mmu.alloc_pte = xen_alloc_pte; + pv_ops.mmu.alloc_pmd = xen_alloc_pmd; + pv_ops.mmu.release_pte = xen_release_pte; + pv_ops.mmu.release_pmd = xen_release_pmd; #ifdef CONFIG_X86_64 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; + pv_ops.mmu.alloc_pud = xen_alloc_pud; + pv_ops.mmu.release_pud = xen_release_pud; #endif - pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); + pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte); #ifdef CONFIG_X86_64 - pv_mmu_ops.write_cr3 = &xen_write_cr3; + pv_ops.mmu.write_cr3 = &xen_write_cr3; #endif } @@ -2462,7 +2474,7 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.hyper.init_after_bootmem = xen_after_bootmem; - pv_mmu_ops = xen_mmu_ops; + pv_ops.mmu = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); } @@ -2662,6 +2674,138 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) } EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); +static noinline void xen_flush_tlb_all(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_ALL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +#define REMAP_BATCH_SIZE 16 + +struct remap_data { + xen_pfn_t *pfn; + bool contiguous; + bool no_translate; + pgprot_t prot; + struct mmu_update *mmu_update; +}; + +static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_data *rmd = data; + pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot)); + + /* + * If we have a contiguous range, just update the pfn itself, + * else update pointer to be "next pfn". + */ + if (rmd->contiguous) + (*rmd->pfn)++; + else + rmd->pfn++; + + rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; + rmd->mmu_update->ptr |= rmd->no_translate ? + MMU_PT_UPDATE_NO_TRANSLATE : + MMU_NORMAL_PT_UPDATE; + rmd->mmu_update->val = pte_val_ma(pte); + rmd->mmu_update++; + + return 0; +} + +int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, + xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot, + unsigned int domid, bool no_translate, struct page **pages) +{ + int err = 0; + struct remap_data rmd; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + unsigned long range; + int mapped = 0; + + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); + + rmd.pfn = pfn; + rmd.prot = prot; + /* + * We use the err_ptr to indicate if there we are doing a contiguous + * mapping or a discontigious mapping. + */ + rmd.contiguous = !err_ptr; + rmd.no_translate = no_translate; + + while (nr) { + int index = 0; + int done = 0; + int batch = min(REMAP_BATCH_SIZE, nr); + int batch_left = batch; + + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_area_pfn_pte_fn, &rmd); + if (err) + goto out; + + /* + * We record the error for each page that gives an error, but + * continue mapping until the whole set is done + */ + do { + int i; + + err = HYPERVISOR_mmu_update(&mmu_update[index], + batch_left, &done, domid); + + /* + * @err_ptr may be the same buffer as @gfn, so + * only clear it after each chunk of @gfn is + * used. + */ + if (err_ptr) { + for (i = index; i < index + done; i++) + err_ptr[i] = 0; + } + if (err < 0) { + if (!err_ptr) + goto out; + err_ptr[i] = err; + done++; /* Skip failed frame. */ + } else + mapped += done; + batch_left -= done; + index += done; + } while (batch_left); + + nr -= batch; + addr += range; + if (err_ptr) + err_ptr += batch; + cond_resched(); + } +out: + + xen_flush_tlb_all(); + + return err < 0 ? err : mapped; +} +EXPORT_SYMBOL_GPL(xen_remap_pfn); + #ifdef CONFIG_KEXEC_CORE phys_addr_t paddr_vmcoreinfo_note(void) { diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 159a897151d6..d6d74efd8912 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + /* * Xen leaves the responsibility for maintaining p2m mappings to the * guests themselves, but it must also access and update the p2m array diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 37c6056a7bba..33293ce01d8d 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + /* Glue code to lib/swiotlb-xen.c */ #include <linux/dma-mapping.h> diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 33a783c77d96..66ab96a4e2b3 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -1,28 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 + /****************************************************************************** * platform-pci-unplug.c * * Xen platform PCI device driver * Copyright (c) 2010, Citrix - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * */ #include <linux/init.h> #include <linux/io.h> #include <linux/export.h> +#include <xen/xen.h> #include <xen/platform_pci.h> #include "xen-ops.h" @@ -30,7 +19,6 @@ #define XEN_PLATFORM_ERR_PROTOCOL -2 #define XEN_PLATFORM_ERR_BLACKLIST -3 -#ifdef CONFIG_XEN_PVHVM /* store the value of xen_emul_unplug after the unplug is done */ static int xen_platform_pci_unplug; static int xen_emul_unplug; @@ -214,4 +202,3 @@ static int __init parse_xen_emul_unplug(char *arg) return 0; } early_param("xen_emul_unplug", parse_xen_emul_unplug); -#endif diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 7d00d4ad44d4..e13b0b49fcdf 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -3,6 +3,7 @@ #include <linux/interrupt.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> @@ -90,6 +91,12 @@ static void xen_pmu_arch_init(void) k7_counters_mirrored = 0; break; } + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + amd_num_counters = F10H_NUM_COUNTERS; + amd_counters_base = MSR_K7_PERFCTR0; + amd_ctrls_base = MSR_K7_EVNTSEL0; + amd_msr_step = 1; + k7_counters_mirrored = 0; } else { uint32_t eax, ebx, ecx, edx; @@ -285,7 +292,7 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { if (is_amd_pmu_msr(msr)) { if (!xen_amd_pmu_emulate(msr, val, 1)) *val = native_read_msr_safe(msr, err); @@ -308,7 +315,7 @@ bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) { uint64_t val = ((uint64_t)high << 32) | low; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { if (is_amd_pmu_msr(msr)) { if (!xen_amd_pmu_emulate(msr, &val, 0)) *err = native_write_msr_safe(msr, low, high); @@ -379,7 +386,7 @@ static unsigned long long xen_intel_read_pmc(int counter) unsigned long long xen_read_pmc(int counter) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return xen_amd_read_pmc(counter); else return xen_intel_read_pmc(counter); @@ -478,7 +485,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { int err, ret = IRQ_NONE; - struct pt_regs regs; + struct pt_regs regs = {0}; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); uint8_t xenpmu_flags = get_xenpmu_flags(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index e3b18ad49889..145506f9fdbe 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -22,6 +22,7 @@ #include <linux/tick.h> #include <linux/nmi.h> #include <linux/cpuhotplug.h> +#include <linux/stackprotector.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -88,6 +89,7 @@ static void cpu_bringup(void) asmlinkage __visible void cpu_bringup_and_idle(void) { cpu_bringup(); + boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 973f10e05211..23f6793af88a 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -141,11 +141,12 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); __pv_init_lock_hash(); - pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; - pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); - pv_lock_ops.wait = xen_qlock_wait; - pv_lock_ops.kick = xen_qlock_kick; - pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); + pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.lock.queued_spin_unlock = + PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops.lock.wait = xen_qlock_wait; + pv_ops.lock.kick = xen_qlock_kick; + pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } static __init int xen_parse_nopvspin(char *arg) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c84f1e039d84..72bf446c3fee 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -513,7 +513,7 @@ static void __init xen_time_init(void) void __init xen_init_time_ops(void) { xen_sched_clock_offset = xen_clocksource_read(); - pv_time_ops = xen_time_ops; + pv_ops.time = xen_time_ops; x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; @@ -555,7 +555,7 @@ void __init xen_hvm_init_time_ops(void) } xen_sched_clock_offset = xen_clocksource_read(); - pv_time_ops = xen_time_ops; + pv_ops.time = xen_time_ops; x86_init.timers.setup_percpu_clockev = xen_time_init; x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h index 861fedfe5230..873c54c488fe 100644 --- a/arch/x86/xen/vdso.h +++ b/arch/x86/xen/vdso.h @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + /* Bit used for the pseudo-hwcap for non-negative segments. We use bit 1 to avoid bugs in some versions of glibc when bit 0 is used; the choice is otherwise arbitrary. */ diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 417b339e5c8e..bb1c2da0381d 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -91,13 +91,15 @@ ENTRY(xen_iret) ENTRY(xen_sysret64) /* * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back + * still with the kernel gs, so we can easily switch back. + * + * tss.sp2 is scratch space. */ - movq %rsp, PER_CPU_VAR(rsp_scratch) + movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER_DS - pushq PER_CPU_VAR(rsp_scratch) + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) pushq %r11 pushq $__USER_CS pushq %rcx diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S index ca2d3b2bf2af..b0e471506cd8 100644 --- a/arch/x86/xen/xen-pvh.S +++ b/arch/x86/xen/xen-pvh.S @@ -1,18 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + /* * Copyright C 2016, Oracle and/or its affiliates. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program. If not, see <http://www.gnu.org/licenses/>. */ .code32 |