diff options
Diffstat (limited to 'arch/x86')
413 files changed, 12709 insertions, 8802 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index eb3abf8ac44e..586b786b3edf 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -7,6 +7,9 @@ obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support obj-$(CONFIG_XEN) += xen/ +# Hyper-V paravirtualization support +obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ + # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bada636d1065..cc98d5a294ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -9,29 +9,55 @@ config 64BIT config X86_32 def_bool y depends on !64BIT + # Options that are inherently 32-bit kernel only: + select ARCH_WANT_IPC_PARSE_VERSION + select CLKSRC_I8253 + select CLONE_BACKWARDS + select HAVE_AOUT + select HAVE_GENERIC_DMA_COHERENT + select MODULES_USE_ELF_REL + select OLD_SIGACTION config X86_64 def_bool y depends on 64BIT + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA + select X86_DEV_DMA_OPS -### Arch settings +# +# Arch settings +# +# ( Note that options that are marked 'if X86_64' could in principle be +# ported to 32-bit as well. ) +# config X86 def_bool y + # + # Note: keep this list sorted alphabetically + # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ANON_INODES select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK - select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if X86_64 select ARCH_HAS_KCOV if X86_64 - select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN + select ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -39,23 +65,17 @@ config X86 select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP - select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS - select ARCH_WANT_IPC_PARSE_VERSION if X86_32 + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 - select CLKSRC_I8253 if X86_32 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG - select CLONE_BACKWARDS if X86_32 - select COMPAT_OLD_SIGACTION if IA32_EMULATION select DCACHE_WORD_ACCESS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT @@ -77,7 +97,6 @@ config X86 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB - select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE @@ -88,12 +107,11 @@ config X86 select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER - select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_WITHIN_STACK_FRAMES - select HAVE_EBPF_JIT if X86_64 + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_VMAP_STACK if X86_64 + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL @@ -106,6 +124,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EBPF_JIT if X86_64 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 @@ -113,7 +132,6 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS - select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_HW_BREAKPOINT select HAVE_IDE select HAVE_IOREMAP_PROT @@ -142,15 +160,11 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING - select MODULES_USE_ELF_RELA if X86_64 - select MODULES_USE_ELF_REL if X86_32 - select OLD_SIGACTION if X86_32 - select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select PERF_EVENTS select RTC_LIB select RTC_MC146818_LIB @@ -160,11 +174,7 @@ config X86 select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS - select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS - select HAVE_STACK_VALIDATION if X86_64 - select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS - select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS config INSTRUCTION_DECODER def_bool y @@ -304,9 +314,6 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y -config DEBUG_RODATA - def_bool y - config PGTABLE_LEVELS int default 4 if X86_64 @@ -407,6 +414,19 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config INTEL_RDT_A + bool "Intel Resource Director Technology Allocation support" + default n + depends on X86 && CPU_SUP_INTEL + select KERNFS + help + Select to enable resource allocation which is a sub-feature of + Intel Resource Director Technology(RDT). More information about + RDT can be found in the Intel x86 Architecture Software + Developer Manual. + + Say N if unsure. + if X86_32 config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" @@ -550,18 +570,6 @@ config X86_INTEL_QUARK Say Y here if you have a Quark based system such as the Arduino compatible Intel Galileo. -config MLX_PLATFORM - tristate "Mellanox Technologies platform support" - depends on X86_64 - depends on X86_EXTENDED_PLATFORM - ---help--- - This option enables system support for the Mellanox Technologies - platform. - - Say Y here if you are building a kernel for Mellanox system. - - Otherwise, say N. - config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on X86 && ACPI @@ -939,6 +947,27 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC_PRIO + bool "CPU core priorities scheduler support" + depends on SCHED_MC && CPU_SUP_INTEL + select X86_INTEL_PSTATE + select CPU_FREQ + default y + ---help--- + Intel Turbo Boost Max Technology 3.0 enabled CPUs have a + core ordering determined at manufacturing time, which allows + certain cores to reach higher turbo frequencies (when running + single threaded workloads) than others. + + Enabling this kernel feature teaches the scheduler about + the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the + scheduler's CPU selection logic accordingly, so that higher + overall system performance can be achieved. + + This feature will have no effect on CPUs without this feature. + + If unsure say Y here. + source "kernel/Kconfig.preempt" config UP_LATE_INIT @@ -1025,7 +1054,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && AMD_NB ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -1043,7 +1072,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE + depends on X86_MCE && X86_LOCAL_APIC tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. @@ -1525,7 +1554,7 @@ config X86_CHECK_BIOS_CORRUPTION line. By default it scans the low 64k of memory every 60 seconds; see the memory_corruption_check_size and memory_corruption_check_period parameters in - Documentation/kernel-parameters.txt to adjust this. + Documentation/admin-guide/kernel-parameters.rst to adjust this. When enabled with the default parameters, this option has almost no overhead, as it reserves a relatively small amount @@ -1737,6 +1766,8 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS def_bool y # Note: only available in 64-bit mode depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_PKEYS ---help--- Memory Protection Keys provides a mechanism for enforcing page-based protections, but without requiring modification of the @@ -1965,10 +1996,6 @@ config RANDOMIZE_BASE theoretically possible, but the implementations are further limited due to memory layouts. - If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot - time. To enable it, boot with "kaslr" on the kernel command - line (which will also disable hibernation). - If unsure, say N. # Relocation on x86 needs some additional build support @@ -2092,7 +2119,7 @@ config DEBUG_HOTPLUG_CPU0 config COMPAT_VDSO def_bool n prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)" - depends on X86_32 || IA32_EMULATION + depends on COMPAT_32 ---help--- Certain buggy versions of glibc will crash if they are presented with a 32-bit vDSO that is not mapped at the address @@ -2694,9 +2721,10 @@ source "fs/Kconfig.binfmt" config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 + select ARCH_WANT_OLD_COMPAT_IPC select BINFMT_ELF select COMPAT_BINFMT_ELF - select ARCH_WANT_OLD_COMPAT_IPC + select COMPAT_OLD_SIGACTION ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're @@ -2721,6 +2749,12 @@ config X86_X32 elf32_x86_64 support enabled to compile a kernel with this option set. +config COMPAT_32 + def_bool y + depends on IA32_EMULATION || X86_32 + select HAVE_UID16 + select OLD_SIGSUSPEND3 + config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 @@ -2753,10 +2787,6 @@ config X86_DMA_REMAP bool depends on STA2X11 -config PMC_ATOM - def_bool y - depends on PCI - source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 67eec55093a5..63c1d13aaf9f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -74,14 +74,6 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA_TEST - bool "Testcase for the marking rodata read-only" - default y - ---help--- - This option enables a testcase for the setting rodata read-only - as well as for the change_page_attr() infrastructure. - If in doubt, say "N" - config DEBUG_WX bool "Warn on W+X mappings at boot" select X86_PTDUMP_CORE @@ -109,25 +101,6 @@ config DEBUG_WX If in doubt, say "Y". -config DEBUG_SET_MODULE_RONX - bool "Set loadable kernel module data as NX and text as RO" - depends on MODULES - ---help--- - This option helps catch unintended modifications to loadable - kernel module's text and read-only data. It also prevents execution - of module data. Such protection may interfere with run-time code - patching and dynamic kernel tracing - and they might also protect - against certain classes of kernel exploits. - If in doubt, say "N". - -config DEBUG_NX_TEST - tristate "Testcase for the NX non-executable stack feature" - depends on DEBUG_KERNEL && m - ---help--- - This option enables a testcase for the CPU NX capability - and the software setup of this feature. - If in doubt, say "N" - config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EXPERT diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 12ea8f8384f4..0d810fb15eac 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -65,7 +65,7 @@ clean-files += cpustr.h # --------------------------------------------------------------------------- -KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP +KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE := n diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index e5612f3e3b57..9b42b6d1e902 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -333,6 +333,7 @@ size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); size_t strlen(const char *s); +char *strchr(const char *s, int c); /* tty.c */ void puts(const char *); diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 34d9e15857c3..44163e8c3868 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,7 +25,7 @@ KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 -KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 +KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index cc69e37548db..801c7a158e55 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -32,160 +32,13 @@ static void setup_boot_services##bits(struct efi_config *c) \ \ table = (typeof(table))sys_table; \ \ + c->runtime_services = table->runtime; \ c->boot_services = table->boottime; \ c->text_output = table->con_out; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); -void efi_char16_printk(efi_system_table_t *, efi_char16_t *); - -static efi_status_t -__file_size32(void *__fh, efi_char16_t *filename_16, - void **handle, u64 *file_sz) -{ - efi_file_handle_32_t *h, *fh = __fh; - efi_file_info_t *info; - efi_status_t status; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - u32 info_sz; - - status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to open file: "); - efi_char16_printk(sys_table, filename_16); - efi_printk(sys_table, "\n"); - return status; - } - - *handle = h; - - info_sz = 0; - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk(sys_table, "Failed to get file info size\n"); - return status; - } - -grow: - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - info_sz, (void **)&info); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for file info\n"); - return status; - } - - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, info); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_early(free_pool, info); - goto grow; - } - - *file_sz = info->file_size; - efi_call_early(free_pool, info); - - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to get initrd info\n"); - - return status; -} - -static efi_status_t -__file_size64(void *__fh, efi_char16_t *filename_16, - void **handle, u64 *file_sz) -{ - efi_file_handle_64_t *h, *fh = __fh; - efi_file_info_t *info; - efi_status_t status; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - u64 info_sz; - - status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to open file: "); - efi_char16_printk(sys_table, filename_16); - efi_printk(sys_table, "\n"); - return status; - } - - *handle = h; - - info_sz = 0; - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk(sys_table, "Failed to get file info size\n"); - return status; - } - -grow: - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - info_sz, (void **)&info); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for file info\n"); - return status; - } - - status = efi_early->call((unsigned long)h->get_info, h, &info_guid, - &info_sz, info); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_early(free_pool, info); - goto grow; - } - - *file_sz = info->file_size; - efi_call_early(free_pool, info); - - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to get initrd info\n"); - - return status; -} -efi_status_t -efi_file_size(efi_system_table_t *sys_table, void *__fh, - efi_char16_t *filename_16, void **handle, u64 *file_sz) -{ - if (efi_early->is64) - return __file_size64(__fh, filename_16, handle, file_sz); - - return __file_size32(__fh, filename_16, handle, file_sz); -} - -efi_status_t -efi_file_read(void *handle, unsigned long *size, void *addr) -{ - unsigned long func; - - if (efi_early->is64) { - efi_file_handle_64_t *fh = handle; - - func = (unsigned long)fh->read; - return efi_early->call(func, handle, size, addr); - } else { - efi_file_handle_32_t *fh = handle; - - func = (unsigned long)fh->read; - return efi_early->call(func, handle, size, addr); - } -} - -efi_status_t efi_file_close(void *handle) -{ - if (efi_early->is64) { - efi_file_handle_64_t *fh = handle; - - return efi_early->call((unsigned long)fh->close, handle); - } else { - efi_file_handle_32_t *fh = handle; - - return efi_early->call((unsigned long)fh->close, handle); - } -} - static inline efi_status_t __open_volume32(void *__image, void **__fh) { efi_file_io_interface_t *io; @@ -249,30 +102,8 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { - unsigned long output_string; - size_t offset; - - if (efi_early->is64) { - struct efi_simple_text_output_protocol_64 *out; - u64 *func; - - offset = offsetof(typeof(*out), output_string); - output_string = efi_early->text_output + offset; - out = (typeof(out))(unsigned long)efi_early->text_output; - func = (u64 *)output_string; - - efi_early->call(*func, out, str); - } else { - struct efi_simple_text_output_protocol_32 *out; - u32 *func; - - offset = offsetof(typeof(*out), output_string); - output_string = efi_early->text_output + offset; - out = (typeof(out))(unsigned long)efi_early->text_output; - func = (u32 *)output_string; - - efi_early->call(*func, out, str); - } + efi_call_proto(efi_simple_text_output_protocol, output_string, + efi_early->text_output, str); } static efi_status_t @@ -537,6 +368,69 @@ free_handle: efi_call_early(free_pool, pci_handle); } +static void retrieve_apple_device_properties(struct boot_params *boot_params) +{ + efi_guid_t guid = APPLE_PROPERTIES_PROTOCOL_GUID; + struct setup_data *data, *new; + efi_status_t status; + u32 size = 0; + void *p; + + status = efi_call_early(locate_protocol, &guid, NULL, &p); + if (status != EFI_SUCCESS) + return; + + if (efi_table_attr(apple_properties_protocol, version, p) != 0x10000) { + efi_printk(sys_table, "Unsupported properties proto version\n"); + return; + } + + efi_call_proto(apple_properties_protocol, get_all, p, NULL, &size); + if (!size) + return; + + do { + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size + sizeof(struct setup_data), &new); + if (status != EFI_SUCCESS) { + efi_printk(sys_table, + "Failed to alloc mem for properties\n"); + return; + } + + status = efi_call_proto(apple_properties_protocol, get_all, p, + new->data, &size); + + if (status == EFI_BUFFER_TOO_SMALL) + efi_call_early(free_pool, new); + } while (status == EFI_BUFFER_TOO_SMALL); + + new->type = SETUP_APPLE_PROPERTIES; + new->len = size; + new->next = 0; + + data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + if (!data) + boot_params->hdr.setup_data = (unsigned long)new; + else { + while (data->next) + data = (struct setup_data *)(unsigned long)data->next; + data->next = (unsigned long)new; + } +} + +static void setup_quirks(struct boot_params *boot_params) +{ + efi_char16_t const apple[] = { 'A', 'p', 'p', 'l', 'e', 0 }; + efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) + efi_table_attr(efi_system_table, fw_vendor, sys_table); + + if (!memcmp(fw_vendor, apple, sizeof(apple))) { + if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) + retrieve_apple_device_properties(boot_params); + } +} + static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { @@ -1094,10 +988,19 @@ struct boot_params *efi_main(struct efi_config *c, else setup_boot_services32(efi_early); + /* + * If the boot loader gave us a value for secure_boot then we use that, + * otherwise we ask the BIOS. + */ + if (boot_params->secure_boot == efi_secureboot_mode_unset) + boot_params->secure_boot = efi_get_secureboot(sys_table); + setup_graphics(boot_params); setup_efi_pci(boot_params); + setup_quirks(boot_params); + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); if (status != EFI_SUCCESS) { diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index fd0b6a272dd5..d85b9625e836 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -82,7 +82,7 @@ ENTRY(efi_pe_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 32(%eax) + add %esi, 40(%eax) pushl %eax call make_boot_params @@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 32(%eax) + add %esi, 40(%eax) pushl %eax 2: call efi_main @@ -264,7 +264,7 @@ relocated: #ifdef CONFIG_EFI_STUB .data efi32_config: - .fill 4,8,0 + .fill 5,8,0 .long efi_call_phys .long 0 .byte 0 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index efdfba21a5b2..d2ae1f821e0c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -119,8 +119,7 @@ ENTRY(startup_32) */ /* Load new GDT with the 64bit segments using 32bit descriptor */ - leal gdt(%ebp), %eax - movl %eax, gdt+2(%ebp) + addl %ebp, gdt+2(%ebp) lgdt gdt(%ebp) /* Enable PAE mode */ @@ -265,7 +264,7 @@ ENTRY(efi_pe_entry) /* * Relocate efi_config->call(). */ - addq %rbp, efi64_config+32(%rip) + addq %rbp, efi64_config+40(%rip) movq %rax, %rdi call make_boot_params @@ -285,7 +284,7 @@ handover_entry: * Relocate efi_config->call(). */ movq efi_config(%rip), %rax - addq %rbp, 32(%rax) + addq %rbp, 40(%rax) 2: movq efi_config(%rip), %rdi call efi_main @@ -457,14 +456,14 @@ efi_config: #ifdef CONFIG_EFI_MIXED .global efi32_config efi32_config: - .fill 4,8,0 + .fill 5,8,0 .quad efi64_thunk .byte 0 #endif .global efi64_config efi64_config: - .fill 4,8,0 + .fill 5,8,0 .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a66854d99ee1..8b7c9e75edcb 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -11,6 +11,7 @@ */ #include "misc.h" #include "error.h" +#include "../boot.h" #include <generated/compile.h> #include <linux/module.h> @@ -52,15 +53,22 @@ static unsigned long get_boot_seed(void) #include "../../lib/kaslr.c" struct mem_vector { - unsigned long start; - unsigned long size; + unsigned long long start; + unsigned long long size; }; +/* Only supporting at most 4 unusable memmap regions with kaslr */ +#define MAX_MEMMAP_REGIONS 4 + +static bool memmap_too_large; + enum mem_avoid_index { MEM_AVOID_ZO_RANGE = 0, MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MEMMAP_BEGIN, + MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1, MEM_AVOID_MAX, }; @@ -77,6 +85,123 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) return true; } +/** + * _memparse - Parse a string with mem suffixes into a number + * @ptr: Where parse begins + * @retptr: (output) Optional pointer to next char after parse completes + * + * Parses a string into a number. The number stored at @ptr is + * potentially suffixed with K, M, G, T, P, E. + */ +static unsigned long long _memparse(const char *ptr, char **retptr) +{ + char *endptr; /* Local pointer to end of parsed string */ + + unsigned long long ret = simple_strtoull(ptr, &endptr, 0); + + switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + case 'P': + case 'p': + ret <<= 10; + case 'T': + case 't': + ret <<= 10; + case 'G': + case 'g': + ret <<= 10; + case 'M': + case 'm': + ret <<= 10; + case 'K': + case 'k': + ret <<= 10; + endptr++; + default: + break; + } + + if (retptr) + *retptr = endptr; + + return ret; +} + +static int +parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +{ + char *oldp; + + if (!p) + return -EINVAL; + + /* We don't care about this option here */ + if (!strncmp(p, "exactmap", 8)) + return -EINVAL; + + oldp = p; + *size = _memparse(p, &p); + if (p == oldp) + return -EINVAL; + + switch (*p) { + case '@': + /* Skip this region, usable */ + *start = 0; + *size = 0; + return 0; + case '#': + case '$': + case '!': + *start = _memparse(p + 1, &p); + return 0; + } + + return -EINVAL; +} + +static void mem_avoid_memmap(void) +{ + char arg[128]; + int rc; + int i; + char *str; + + /* See if we have any memmap areas */ + rc = cmdline_find_option("memmap", arg, sizeof(arg)); + if (rc <= 0) + return; + + i = 0; + str = arg; + while (str && (i < MAX_MEMMAP_REGIONS)) { + int rc; + unsigned long long start, size; + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + rc = parse_memmap(str, &start, &size); + if (rc < 0) + break; + str = k; + /* A usable region that should not be skipped */ + if (size == 0) + continue; + + mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; + mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; + i++; + } + + /* More than 4 memmaps, fail kaslr */ + if ((i >= MAX_MEMMAP_REGIONS) && str) + memmap_too_large = true; +} + /* * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). * The mem_avoid array is used to store the ranges that need to be avoided @@ -197,6 +322,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ + /* Mark the memmap regions we need to avoid */ + mem_avoid_memmap(); + #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ add_identity_map(0, PMD_SIZE); @@ -379,6 +507,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum, int i; unsigned long addr; + /* Check if we had too many memmaps. */ + if (memmap_too_large) { + debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); + return 0; + } + /* Make sure minimum is aligned. */ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); @@ -456,7 +590,7 @@ void choose_random_location(unsigned long input, /* Walk e820 and find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { - warn("KASLR disabled: could not find suitable E820 region!"); + warn("Physical KASLR disabled: no suitable memory region!"); } else { /* Update the new physical address location. */ if (*output != random_addr) { diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 4224ede43b4e..26240dde081e 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -87,12 +87,6 @@ int validate_cpu(void) return -1; } - if (CONFIG_X86_MINIMUM_CPU_FAMILY <= 4 && !IS_ENABLED(CONFIG_M486) && - !has_eflag(X86_EFLAGS_ID)) { - printf("This kernel requires a CPU with the CPUID instruction. Build with CONFIG_M486=y to run on this CPU.\n"); - return -1; - } - if (err_flags) { puts("This kernel requires the following features " "not present on the CPU:\n"); diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index cc3bd583dce1..5457b02fc050 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include "ctype.h" +#include "string.h" int memcmp(const void *s1, const void *s2, size_t len) { @@ -155,3 +156,16 @@ char *strstr(const char *s1, const char *s2) } return NULL; } + +/** + * strchr - Find the first occurrence of the character c in the string s. + * @s: the string to be searched + * @c: the character to search for + */ +char *strchr(const char *s, int c) +{ + while (*s != (char)c) + if (*s++ == '\0') + return NULL; + return (char *)s; +} diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 725e820602b1..113588ddb43f 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -18,4 +18,13 @@ int memcmp(const void *s1, const void *s2, size_t len); #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp +extern int strcmp(const char *str1, const char *str2); +extern int strncmp(const char *cs, const char *ct, size_t count); +extern size_t strlen(const char *s); +extern char *strstr(const char *s1, const char *s2); +extern size_t strnlen(const char *s, size_t maxlen); +extern unsigned int atou(const char *s); +extern unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base); + #endif /* BOOT_STRING_H */ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 383a6f84a060..3c465184ff8a 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -46,28 +46,49 @@ #ifdef __x86_64__ -.data +# constants in mergeable sections, linker can reorder and merge +.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 .align 16 .Lgf128mul_x_ble_mask: .octa 0x00000000000000010000000000000087 +.section .rodata.cst16.POLY, "aM", @progbits, 16 +.align 16 POLY: .octa 0xC2000000000000000000000000000001 +.section .rodata.cst16.TWOONE, "aM", @progbits, 16 +.align 16 TWOONE: .octa 0x00000001000000000000000000000001 -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, -# and ZERO should follow ALL_F - +.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +.align 16 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +.section .rodata.cst16.MASK1, "aM", @progbits, 16 +.align 16 MASK1: .octa 0x0000000000000000ffffffffffffffff +.section .rodata.cst16.MASK2, "aM", @progbits, 16 +.align 16 MASK2: .octa 0xffffffffffffffff0000000000000000 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff -ZERO: .octa 0x00000000000000000000000000000000 +.section .rodata.cst16.ONE, "aM", @progbits, 16 +.align 16 ONE: .octa 0x00000000000000000000000000000001 +.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 +.align 16 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 +.section .rodata.cst16.dec, "aM", @progbits, 16 +.align 16 dec: .octa 0x1 +.section .rodata.cst16.enc, "aM", @progbits, 16 +.align 16 enc: .octa 0x2 +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, +# and zero should follow ALL_F +.section .rodata, "a", @progbits +.align 16 +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff + .octa 0x00000000000000000000000000000000 + .text diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 522ab68d1c88..d664382c6e56 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -122,23 +122,39 @@ #include <linux/linkage.h> #include <asm/inst.h> -.data +# constants in mergeable sections, linker can reorder and merge +.section .rodata.cst16.POLY, "aM", @progbits, 16 .align 16 - POLY: .octa 0xC2000000000000000000000000000001 + +.section .rodata.cst16.POLY2, "aM", @progbits, 16 +.align 16 POLY2: .octa 0xC20000000000000000000001C2000000 -TWOONE: .octa 0x00000001000000000000000000000001 -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F +.section .rodata.cst16.TWOONE, "aM", @progbits, 16 +.align 16 +TWOONE: .octa 0x00000001000000000000000000000001 +.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +.align 16 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff -ZERO: .octa 0x00000000000000000000000000000000 + +.section .rodata.cst16.ONE, "aM", @progbits, 16 +.align 16 ONE: .octa 0x00000000000000000000000000000001 + +.section .rodata.cst16.ONEf, "aM", @progbits, 16 +.align 16 ONEf: .octa 0x01000000000000000000000000000000 +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F +.section .rodata, "a", @progbits +.align 16 +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff + .octa 0x00000000000000000000000000000000 + .text diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index aa8b0672f87a..93de8ea51548 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -21,7 +21,6 @@ #include <linux/hardirq.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/module.h> #include <linux/err.h> #include <crypto/algapi.h> @@ -29,14 +28,14 @@ #include <crypto/cryptd.h> #include <crypto/ctr.h> #include <crypto/b128ops.h> -#include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> #include <asm/fpu/api.h> #include <asm/crypto/aes.h> -#include <crypto/ablk_helper.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #ifdef CONFIG_X86_64 @@ -45,28 +44,26 @@ #define AESNI_ALIGN 16 +#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN))) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) #define RFC4106_HASH_SUBKEY_SIZE 16 +#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) +#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) +#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. */ struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN))); - struct crypto_aes_ctx aes_key_expanded - __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 hash_subkey[16] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; u8 nonce[4]; }; -struct aesni_lrw_ctx { - struct lrw_table_ctx lrw_table; - u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; -}; - struct aesni_xts_ctx { - u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; - u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; + u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; + u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; }; asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -360,96 +357,95 @@ static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) aesni_dec(ctx, dst, src); } -static int ecb_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int len) +{ + return aes_set_key_common(crypto_skcipher_tfm(tfm), + crypto_skcipher_ctx(tfm), key, len); +} + +static int ecb_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int ecb_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int cbc_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); @@ -458,7 +454,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc, #ifdef CONFIG_X86_64 static void ctr_crypt_final(struct crypto_aes_ctx *ctx, - struct blkcipher_walk *walk) + struct skcipher_walk *walk) { u8 *ctrblk = walk->iv; u8 keystream[AES_BLOCK_SIZE]; @@ -491,157 +487,53 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, } #endif -static int ctr_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } if (walk.nbytes) { ctr_crypt_final(ctx, &walk); - err = blkcipher_walk_done(desc, &walk, 0); + err = skcipher_walk_done(&walk, 0); } kernel_fpu_end(); return err; } -#endif - -static int ablk_ecb_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); -} - -static int ablk_cbc_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-cbc-aes-aesni"); -} - -#ifdef CONFIG_X86_64 -static int ablk_ctr_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); -} - -#endif - -#if IS_ENABLED(CONFIG_CRYPTO_PCBC) -static int ablk_pcbc_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); -} -#endif - -static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) -{ - aesni_ecb_enc(ctx, blks, blks, nbytes); -} -static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) -{ - aesni_ecb_dec(ctx, blks, blks, nbytes); -} - -static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, +static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int err; - err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key, - keylen - AES_BLOCK_SIZE); + err = xts_verify_key(tfm, key, keylen); if (err) return err; - return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE); -} - -static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm) -{ - struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), - .crypt_fn = lrw_xts_encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = lrw_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), - .crypt_fn = lrw_xts_decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = lrw_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = xts_check_key(tfm, key, keylen); - if (err) - return err; + keylen /= 2; /* first half of xts-key is for crypt */ - err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); + err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, + key, keylen); if (err) return err; /* second half of xts-key is for tweak */ - return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2, - keylen / 2); + return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, + key + keylen, keylen); } @@ -650,8 +542,6 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) aesni_enc(ctx, out, in); } -#ifdef CONFIG_X86_64 - static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); @@ -698,83 +588,28 @@ static const struct common_glue_ctx aesni_dec_xts = { } } }; -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(aesni_xts_tweak), - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + return glue_xts_req_128bit(&aesni_enc_xts, req, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(aesni_xts_tweak), - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + return glue_xts_req_128bit(&aesni_dec_xts, req, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); } -#else - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = aesni_xts_tweak, - .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), - .crypt_fn = lrw_xts_encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = xts_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = aesni_xts_tweak, - .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), - .crypt_fn = lrw_xts_decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = xts_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -#endif - -#ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_aead *aead) { struct cryptd_aead *cryptd_tfm; @@ -905,9 +740,11 @@ static int helper_rfc4106_encrypt(struct aead_request *req) *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && - req->src->offset + req->src->length <= PAGE_SIZE && + (!PageHighMem(sg_page(req->src)) || + req->src->offset + req->src->length <= PAGE_SIZE) && sg_is_last(req->dst) && - req->dst->offset + req->dst->length <= PAGE_SIZE) { + (!PageHighMem(sg_page(req->dst)) || + req->dst->offset + req->dst->length <= PAGE_SIZE)) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); assoc = scatterwalk_map(&src_sg_walk); @@ -987,9 +824,11 @@ static int helper_rfc4106_decrypt(struct aead_request *req) *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && - req->src->offset + req->src->length <= PAGE_SIZE && + (!PageHighMem(sg_page(req->src)) || + req->src->offset + req->src->length <= PAGE_SIZE) && sg_is_last(req->dst) && - req->dst->offset + req->dst->length <= PAGE_SIZE) { + (!PageHighMem(sg_page(req->dst)) || + req->dst->offset + req->dst->length <= PAGE_SIZE)) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); assoc = scatterwalk_map(&src_sg_walk); @@ -1077,9 +916,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, .cra_u = { .cipher = { @@ -1091,14 +928,12 @@ static struct crypto_alg aesni_algs[] = { { } } }, { - .cra_name = "__aes-aesni", - .cra_driver_name = "__driver-aes-aesni", - .cra_priority = 0, + .cra_name = "__aes", + .cra_driver_name = "__aes-aesni", + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, .cra_u = { .cipher = { @@ -1109,250 +944,95 @@ static struct crypto_alg aesni_algs[] = { { .cia_decrypt = __aes_decrypt } } -}, { - .cra_name = "__ecb-aes-aesni", - .cra_driver_name = "__driver-ecb-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-aes-aesni", - .cra_driver_name = "__driver-cbc-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_ecb_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, +} }; + +static struct skcipher_alg aesni_skciphers[] = { + { + .base = { + .cra_name = "__ecb(aes)", + .cra_driver_name = "__ecb-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, -}, { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_cbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(aes)", + .cra_driver_name = "__cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, #ifdef CONFIG_X86_64 -}, { - .cra_name = "__ctr-aes-aesni", - .cra_driver_name = "__driver-ctr-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_set_key, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, + }, { + .base = { + .cra_name = "__ctr(aes)", + .cra_driver_name = "__ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, -}, { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base = { + .cra_name = "__xts(aes)", + .cra_driver_name = "__xts-aes-aesni", + .cra_priority = 401, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = XTS_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aesni_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, #endif -#if IS_ENABLED(CONFIG_CRYPTO_PCBC) -}, { - .cra_name = "pcbc(aes)", - .cra_driver_name = "pcbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_pcbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, + } +}; + +struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; + +struct { + const char *algname; + const char *drvname; + const char *basename; + struct simd_skcipher_alg *simd; +} aesni_simd_skciphers2[] = { +#if (defined(MODULE) && IS_ENABLED(CONFIG_CRYPTO_PCBC)) || \ + IS_BUILTIN(CONFIG_CRYPTO_PCBC) + { + .algname = "pcbc(aes)", + .drvname = "pcbc-aes-aesni", + .basename = "fpu(pcbc(__aes-aesni))", }, #endif -}, { - .cra_name = "__lrw-aes-aesni", - .cra_driver_name = "__driver-lrw-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aesni_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_aesni_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = lrw_aesni_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-aes-aesni", - .cra_driver_name = "__driver-xts-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aesni_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_aesni_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "lrw(aes)", - .cra_driver_name = "lrw-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(aes)", - .cra_driver_name = "xts-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -} }; +}; #ifdef CONFIG_X86_64 static struct aead_alg aesni_aead_algs[] = { { @@ -1401,9 +1081,27 @@ static const struct x86_cpu_id aesni_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); +static void aesni_free_simds(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) && + aesni_simd_skciphers[i]; i++) + simd_skcipher_free(aesni_simd_skciphers[i]); + + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) + if (aesni_simd_skciphers2[i].simd) + simd_skcipher_free(aesni_simd_skciphers2[i].simd); +} + static int __init aesni_init(void) { + struct simd_skcipher_alg *simd; + const char *basename; + const char *algname; + const char *drvname; int err; + int i; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; @@ -1445,13 +1143,48 @@ static int __init aesni_init(void) if (err) goto fpu_exit; + err = crypto_register_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); + if (err) + goto unregister_algs; + err = crypto_register_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); if (err) - goto unregister_algs; + goto unregister_skciphers; + + for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) { + algname = aesni_skciphers[i].base.cra_name + 2; + drvname = aesni_skciphers[i].base.cra_driver_name + 2; + basename = aesni_skciphers[i].base.cra_driver_name; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + goto unregister_simds; + + aesni_simd_skciphers[i] = simd; + } - return err; + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) { + algname = aesni_simd_skciphers2[i].algname; + drvname = aesni_simd_skciphers2[i].drvname; + basename = aesni_simd_skciphers2[i].basename; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + continue; + aesni_simd_skciphers2[i].simd = simd; + } + + return 0; + +unregister_simds: + aesni_free_simds(); + crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); +unregister_skciphers: + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); unregister_algs: crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); fpu_exit: @@ -1461,7 +1194,10 @@ fpu_exit: static void __exit aesni_exit(void) { + aesni_free_simds(); crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); crypto_fpu_exit(); diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index aa9e8bd163f6..f7c495e2863c 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -571,7 +571,9 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu y6, 14 * 16(rio); \ vmovdqu y7, 15 * 16(rio); -.data + +/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ +.section .rodata.cst16, "aM", @progbits, 16 .align 16 #define SHUFB_BYTES(idx) \ @@ -711,6 +713,7 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 /* 4-bit mask */ +.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 .align 4 .L0f0f0f0f: .long 0x0f0f0f0f diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 16186c18656d..eee5b3982cfd 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -610,20 +610,25 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu y6, 14 * 32(rio); \ vmovdqu y7, 15 * 32(rio); -.data -.align 32 +.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32 +.align 32 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) - .Lshufb_16x16b: .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) +.section .rodata.cst32.pack_bswap, "aM", @progbits, 32 +.align 32 .Lpack_bswap: .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 +/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 @@ -750,6 +755,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 .align 4 /* 4-bit mask */ .L0f0f0f0f: diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 14fa1966bf01..b4a8806234ea 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -195,19 +195,29 @@ vpshufb rmask, x0, x0; \ vpshufb rmask, x1, x1; -.data - +.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 .align 16 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 +.align 16 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16 +.align 16 .Lbswap_iv_mask: .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst4.16_mask, "aM", @progbits, 4 +.align 4 .L16_mask: .byte 16, 16, 16, 16 +.section .rodata.cst4.32_mask, "aM", @progbits, 4 +.align 4 .L32_mask: .byte 32, 0, 0, 0 +.section .rodata.cst4.first_mask, "aM", @progbits, 4 +.align 4 .Lfirst_mask: .byte 0x1f, 0, 0, 0 diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index c419389889cd..952d3156a933 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -225,8 +225,7 @@ vpshufb rmask, x2, x2; \ vpshufb rmask, x3, x3; -.data - +.section .rodata.cst16, "aM", @progbits, 16 .align 16 .Lxts_gf128mul_and_shl1_mask: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 @@ -244,10 +243,19 @@ .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0 .Lrkr_dec_QBAR_QBAR_QBAR_QBAR: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst4.L16_mask, "aM", @progbits, 4 +.align 4 .L16_mask: .byte 16, 16, 16, 16 + +.section .rodata.cst4.L32_mask, "aM", @progbits, 4 +.align 4 .L32_mask: .byte 32, 0, 0, 0 + +.section .rodata.cst4.first_mask, "aM", @progbits, 4 +.align 4 .Lfirst_mask: .byte 0x1f, 0, 0, 0 diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 16694e625f77..3a2dc3dc6cac 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -11,13 +11,18 @@ #include <linux/linkage.h> -.data +.section .rodata.cst32.ROT8, "aM", @progbits, 32 .align 32 - ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 CTRINC: .octa 0x00000003000000020000000100000000 .octa 0x00000007000000060000000500000004 diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 3a33124e9112..3f511a7d73b8 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -11,11 +11,14 @@ #include <linux/linkage.h> -.data +.section .rodata.cst16.ROT8, "aM", @progbits, 16 .align 16 - ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 CTRINC: .octa 0x00000003000000020000000100000000 .text diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index f910d1d449f0..1e6af1b35f7b 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -11,7 +11,7 @@ #include <crypto/algapi.h> #include <crypto/chacha20.h> -#include <linux/crypto.h> +#include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> #include <asm/fpu/api.h> @@ -63,36 +63,37 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int chacha20_simd(struct skcipher_request *req) { - u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 *state, state_buf[16 + 2] __aligned(8); + struct skcipher_walk walk; int err; - if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha20_crypt(desc, dst, src, nbytes); + BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); + if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha20_crypt(req); - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); + err = skcipher_walk_virt(&walk, req, true); - crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + crypto_chacha20_init(state, ctx, walk.iv); kernel_fpu_begin(); while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); - err = blkcipher_walk_done(desc, &walk, - walk.nbytes % CHACHA20_BLOCK_SIZE); + err = skcipher_walk_done(&walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); } if (walk.nbytes) { chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes); - err = blkcipher_walk_done(desc, &walk, 0); + err = skcipher_walk_done(&walk, 0); } kernel_fpu_end(); @@ -100,27 +101,22 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg alg = { - .cra_name = "chacha20", - .cra_driver_name = "chacha20-simd", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_type = &crypto_blkcipher_type, - .cra_ctxsize = sizeof(struct chacha20_ctx), - .cra_alignmask = sizeof(u32) - 1, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .geniv = "seqiv", - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, - }, - }, +static struct skcipher_alg alg = { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_alignmask = sizeof(u32) - 1, + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .chunksize = CHACHA20_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, }; static int __init chacha20_simd_mod_init(void) @@ -133,12 +129,12 @@ static int __init chacha20_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif - return crypto_register_alg(&alg); + return crypto_register_skcipher(&alg); } static void __exit chacha20_simd_mod_fini(void) { - crypto_unregister_alg(&alg); + crypto_unregister_skcipher(&alg); } module_init(chacha20_simd_mod_init); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0857b1a1de3b..c194d5717ae5 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,26 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. */ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#if defined(X86_FEATURE_EAGER_FPU) -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) -#else -#define set_pcl_breakeven_point() \ - (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) -#endif #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -190,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -202,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -261,7 +248,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(&alg); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index dc05f010ca9b..7a7de27c6f41 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -312,7 +312,7 @@ do_return: ret ENDPROC(crc_pcl) -.section .rodata, "a", %progbits +.section .rodata, "a", @progbits ################################################################ ## jump table Table is 129 entries x 2 bytes each ################################################################ diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index 35e97569d05f..de04d3e98d8d 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -554,12 +554,11 @@ _only_less_than_2: ENDPROC(crc_t10dif_pcl) -.data - +.section .rodata, "a", @progbits +.align 16 # precomputed constants # these constants are precomputed from the poly: # 0x8bb70000 (0x8bb7 scaled to 32 bits) -.align 16 # Q = 0x18BB70000 # rk1 = 2^(32*3) mod Q << 32 # rk2 = 2^(32*5) mod Q << 32 @@ -613,14 +612,23 @@ rk20: +.section .rodata.cst16.mask1, "aM", @progbits, 16 +.align 16 mask1: .octa 0x80808080808080808080808080808080 + +.section .rodata.cst16.mask2, "aM", @progbits, 16 +.align 16 mask2: .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF +.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +.align 16 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32 +.align 32 pshufb_shf_table: # use these values for shift constants for the pshufb instruction # different alignments result in values as shown: diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index 038f6ae87c5e..f3e91647ca27 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -537,7 +537,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) ret; ENDPROC(des3_ede_x86_64_crypt_blk_3way) -.data +.section .rodata, "a", @progbits .align 16 .L_s1: .quad 0x0010100001010400, 0x0000000000000000 diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index e7d679e2a018..406680476c52 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -11,143 +11,186 @@ * */ -#include <crypto/algapi.h> +#include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> -#include <linux/crypto.h> #include <asm/fpu/api.h> struct crypto_fpu_ctx { - struct crypto_blkcipher *child; + struct crypto_skcipher *child; }; -static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key, +static int crypto_fpu_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent); - struct crypto_blkcipher *child = ctx->child; + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(parent); + struct crypto_skcipher *child = ctx->child; int err; - crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) & - CRYPTO_TFM_REQ_MASK); - err = crypto_blkcipher_setkey(child, key, keylen); - crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); + crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_skcipher_setkey(child, key, keylen); + crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); return err; } -static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int crypto_fpu_encrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_REQUEST_ON_STACK(subreq, child); int err; - struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); - struct crypto_blkcipher *child = ctx->child; - struct blkcipher_desc desc = { - .tfm = child, - .info = desc_in->info, - .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, - }; + + skcipher_request_set_tfm(subreq, child); + skcipher_request_set_callback(subreq, 0, NULL, NULL); + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); kernel_fpu_begin(); - err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes); + err = crypto_skcipher_encrypt(subreq); kernel_fpu_end(); + + skcipher_request_zero(subreq); return err; } -static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int crypto_fpu_decrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_REQUEST_ON_STACK(subreq, child); int err; - struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); - struct crypto_blkcipher *child = ctx->child; - struct blkcipher_desc desc = { - .tfm = child, - .info = desc_in->info, - .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, - }; + + skcipher_request_set_tfm(subreq, child); + skcipher_request_set_callback(subreq, 0, NULL, NULL); + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); kernel_fpu_begin(); - err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes); + err = crypto_skcipher_decrypt(subreq); kernel_fpu_end(); + + skcipher_request_zero(subreq); return err; } -static int crypto_fpu_init_tfm(struct crypto_tfm *tfm) +static int crypto_fpu_init_tfm(struct crypto_skcipher *tfm) { - struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); - struct crypto_spawn *spawn = crypto_instance_ctx(inst); - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); - struct crypto_blkcipher *cipher; + struct skcipher_instance *inst = skcipher_alg_instance(tfm); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher_spawn *spawn; + struct crypto_skcipher *cipher; - cipher = crypto_spawn_blkcipher(spawn); + spawn = skcipher_instance_ctx(inst); + cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; + return 0; } -static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm) +static void crypto_fpu_exit_tfm(struct crypto_skcipher *tfm) +{ + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + + crypto_free_skcipher(ctx->child); +} + +static void crypto_fpu_free(struct skcipher_instance *inst) { - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(ctx->child); + crypto_drop_skcipher(skcipher_instance_ctx(inst)); + kfree(inst); } -static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb) +static int crypto_fpu_create(struct crypto_template *tmpl, struct rtattr **tb) { - struct crypto_instance *inst; - struct crypto_alg *alg; + struct crypto_skcipher_spawn *spawn; + struct skcipher_instance *inst; + struct crypto_attr_type *algt; + struct skcipher_alg *alg; + const char *cipher_name; int err; - err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); + algt = crypto_get_attr_type(tb); + if (IS_ERR(algt)) + return PTR_ERR(algt); + + if ((algt->type ^ (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_TYPE_SKCIPHER)) & + algt->mask) + return -EINVAL; + + if (!(algt->mask & CRYPTO_ALG_INTERNAL)) + return -EINVAL; + + cipher_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(cipher_name)) + return PTR_ERR(cipher_name); + + inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); + if (!inst) + return -ENOMEM; + + spawn = skcipher_instance_ctx(inst); + + crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst)); + err = crypto_grab_skcipher(spawn, cipher_name, CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC); if (err) - return ERR_PTR(err); - - alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER, - CRYPTO_ALG_TYPE_MASK); - if (IS_ERR(alg)) - return ERR_CAST(alg); - - inst = crypto_alloc_instance("fpu", alg); - if (IS_ERR(inst)) - goto out_put_alg; - - inst->alg.cra_flags = alg->cra_flags; - inst->alg.cra_priority = alg->cra_priority; - inst->alg.cra_blocksize = alg->cra_blocksize; - inst->alg.cra_alignmask = alg->cra_alignmask; - inst->alg.cra_type = alg->cra_type; - inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize; - inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize; - inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize; - inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx); - inst->alg.cra_init = crypto_fpu_init_tfm; - inst->alg.cra_exit = crypto_fpu_exit_tfm; - inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey; - inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt; - inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt; - -out_put_alg: - crypto_mod_put(alg); - return inst; -} + goto out_free_inst; -static void crypto_fpu_free(struct crypto_instance *inst) -{ - crypto_drop_spawn(crypto_instance_ctx(inst)); + alg = crypto_skcipher_spawn_alg(spawn); + + err = crypto_inst_setname(skcipher_crypto_instance(inst), "fpu", + &alg->base); + if (err) + goto out_drop_skcipher; + + inst->alg.base.cra_flags = CRYPTO_ALG_INTERNAL; + inst->alg.base.cra_priority = alg->base.cra_priority; + inst->alg.base.cra_blocksize = alg->base.cra_blocksize; + inst->alg.base.cra_alignmask = alg->base.cra_alignmask; + + inst->alg.ivsize = crypto_skcipher_alg_ivsize(alg); + inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg); + inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg); + + inst->alg.base.cra_ctxsize = sizeof(struct crypto_fpu_ctx); + + inst->alg.init = crypto_fpu_init_tfm; + inst->alg.exit = crypto_fpu_exit_tfm; + + inst->alg.setkey = crypto_fpu_setkey; + inst->alg.encrypt = crypto_fpu_encrypt; + inst->alg.decrypt = crypto_fpu_decrypt; + + inst->free = crypto_fpu_free; + + err = skcipher_register_instance(tmpl, inst); + if (err) + goto out_drop_skcipher; + +out: + return err; + +out_drop_skcipher: + crypto_drop_skcipher(spawn); +out_free_inst: kfree(inst); + goto out; } static struct crypto_template crypto_fpu_tmpl = { .name = "fpu", - .alloc = crypto_fpu_alloc, - .free = crypto_fpu_free, + .create = crypto_fpu_create, .module = THIS_MODULE, }; diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index eed55c8cca4f..f94375a8dcd1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -20,8 +20,7 @@ #include <asm/inst.h> #include <asm/frame.h> -.data - +.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 .align 16 .Lbswap_mask: .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 6a85598931b5..260a060d7275 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -27,10 +27,10 @@ #include <linux/module.h> #include <crypto/b128ops.h> +#include <crypto/internal/skcipher.h> #include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/crypto/glue_helper.h> -#include <crypto/scatterwalk.h> static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, @@ -339,6 +339,41 @@ done: return nbytes; } +static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, + void *ctx, + struct skcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = walk->src.virt.addr; + u128 *dst = walk->dst.virt.addr; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.xts(ctx, dst, src, + walk->iv); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + return nbytes; +} + /* for implementations implementing faster XTS IV generator */ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, struct scatterlist *dst, @@ -379,6 +414,43 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); +int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx) +{ + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; + bool fpu_enabled = false; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + nbytes = walk.nbytes; + if (!nbytes) + return err; + + /* set minimum length to bsize, for tweak_fn */ + fpu_enabled = glue_skwalk_fpu_begin(bsize, gctx->fpu_blocks_limit, + &walk, fpu_enabled, + nbytes < bsize ? bsize : nbytes); + + /* calculate first value of T */ + tweak_fn(tweak_ctx, walk.iv, walk.iv); + + while (nbytes) { + nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); + + err = skcipher_walk_done(&walk, nbytes); + nbytes = walk.nbytes; + } + + glue_fpu_end(fpu_enabled); + + return err; +} +EXPORT_SYMBOL_GPL(glue_xts_req_128bit); + void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, common_glue_func_t fn) { diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index eff2f414e22b..3b6e70d085da 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -11,11 +11,13 @@ #include <linux/linkage.h> -.data +.section .rodata.cst32.ANMASK, "aM", @progbits, 32 .align 32 - ANMASK: .octa 0x0000000003ffffff0000000003ffffff .octa 0x0000000003ffffff0000000003ffffff + +.section .rodata.cst32.ORMASK, "aM", @progbits, 32 +.align 32 ORMASK: .octa 0x00000000010000000000000001000000 .octa 0x00000000010000000000000001000000 diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index 338c748054ed..c88c670cb5fc 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -11,10 +11,12 @@ #include <linux/linkage.h> -.data +.section .rodata.cst16.ANMASK, "aM", @progbits, 16 .align 16 - ANMASK: .octa 0x0000000003ffffff0000000003ffffff + +.section .rodata.cst16.ORMASK, "aM", @progbits, 16 +.align 16 ORMASK: .octa 0x00000000010000000000000001000000 .text diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 8be571808342..2925077f8c6a 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -29,11 +29,12 @@ .file "serpent-avx-x86_64-asm_64.S" -.data +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 .align 16 - .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 97c48add33ed..d67888f2a52a 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -20,13 +20,18 @@ .file "serpent-avx2-asm_64.S" -.data +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 .align 16 - .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask_0: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 + +.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask_1: .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index 9e5b67127a09..acf9fdf01671 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -114,7 +114,7 @@ static inline void sha1_init_digest(uint32_t *digest) } static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h index 98a35bcc6f4a..13590ccf965c 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h @@ -125,7 +125,7 @@ struct sha1_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S index 96df6a39d7e2..93b945597ecf 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S @@ -281,11 +281,13 @@ ENTRY(sha1_mb_mgr_get_comp_job_avx2) ret ENDPROC(sha1_mb_mgr_get_comp_job_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 0x000000000000000000000000FFFFFFF0 + +.section .rodata.cst8, "aM", @progbits, 8 +.align 8 one: .quad 1 two: diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S index 63a0d9c8e31f..7a93b1c0d69a 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S @@ -203,8 +203,7 @@ return_null: ENDPROC(sha1_mb_mgr_submit_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 0x000000000000000000000000FFFFFFF0 diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S index c9dae1cd2919..20f77aa633de 100644 --- a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S +++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S @@ -461,21 +461,32 @@ lloop: ENDPROC(sha1_x8_avx2) -.data - +.section .rodata.cst32.K00_19, "aM", @progbits, 32 .align 32 K00_19: .octa 0x5A8279995A8279995A8279995A827999 .octa 0x5A8279995A8279995A8279995A827999 + +.section .rodata.cst32.K20_39, "aM", @progbits, 32 +.align 32 K20_39: .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + +.section .rodata.cst32.K40_59, "aM", @progbits, 32 +.align 32 K40_59: .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + +.section .rodata.cst32.K60_79, "aM", @progbits, 32 +.align 32 K60_79: .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 874a651b9e7d..ebbdba72ae07 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -293,10 +293,12 @@ ENTRY(sha1_ni_transform) ret ENDPROC(sha1_ni_transform) -.data - -.align 64 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f + +.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 +.align 16 UPPER_WORD_MASK: .octa 0xFFFFFFFF000000000000000000000000 diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 92b3b5d75ba9..e08888a1a5f2 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -463,7 +463,7 @@ done_hash: ret ENDPROC(sha256_transform_avx) -.data +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -483,14 +483,21 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 # shuffle xBxA -> 00BA _SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 # shuffle xDxC -> DC00 _SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF + #endif diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 570ec5ec62d7..89c8f09787d2 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -723,7 +723,7 @@ done_hash: ret ENDPROC(sha256_transform_rorx) -.data +.section .rodata.cst512.K256, "aM", @progbits, 512 .align 64 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -759,14 +759,21 @@ K256: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 # shuffle xBxA -> 00BA +.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 +.align 32 _SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 # shuffle xDxC -> DC00 +.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 +.align 32 _SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF + #endif diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 6f97fb33ae21..7926a226b120 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -115,7 +115,7 @@ inline void sha256_init_digest(uint32_t *digest) } inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h index edd252b73206..aabb30320af0 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h +++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h @@ -125,7 +125,7 @@ struct sha256_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index a78a0694ddef..8fe6338bcc84 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -284,11 +284,13 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) ret ENDPROC(sha256_mb_mgr_get_comp_job_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 0x000000000000000000000000FFFFFFF0 + +.section .rodata.cst8, "aM", @progbits, 8 +.align 8 one: .quad 1 two: diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S index 7ea670e25acc..b36ae7454084 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S @@ -208,8 +208,7 @@ return_null: ENDPROC(sha256_mb_mgr_submit_avx2) -.data - +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 .align 16 clear_low_nibble: .octa 0x000000000000000000000000FFFFFFF0 diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S index aa21aea4c722..1687c80c5995 100644 --- a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S @@ -437,7 +437,8 @@ Lrounds_16_xx: ret ENDPROC(sha256_x8_avx2) -.data + +.section .rodata.K256_8, "a", @progbits .align 64 K256_8: .octa 0x428a2f98428a2f98428a2f98428a2f98 @@ -568,10 +569,14 @@ K256_8: .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7 .octa 0xc67178f2c67178f2c67178f2c67178f2 .octa 0xc67178f2c67178f2c67178f2c67178f2 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 .octa 0x0c0d0e0f08090a0b0405060700010203 +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 .global K256 K256: diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 2cedc44e8121..39b83c93e7fd 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -474,7 +474,7 @@ done_hash: ret ENDPROC(sha256_transform_ssse3) -.data +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -494,13 +494,19 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 # shuffle xBxA -> 00BA _SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 # shuffle xDxC -> DC00 _SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 748cdf21a938..fb58f58ecfbc 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -329,7 +329,7 @@ ENTRY(sha256_ni_transform) ret ENDPROC(sha256_ni_transform) -.data +.section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 @@ -349,5 +349,7 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 565274d6a641..39235fefe6f7 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -370,14 +370,17 @@ ENDPROC(sha512_transform_avx) ######################################################################## ### Binary Data -.data - +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 .align 16 - # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. XMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 # K[t] used in SHA512 hashing K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 1f20b35d8573..7f5f6c6ec72e 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -684,8 +684,11 @@ ENDPROC(sha512_transform_rorx) ######################################################################## ### Binary Data -.data +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 .align 64 # K[t] used in SHA512 hashing K512: @@ -730,14 +733,17 @@ K512: .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 .align 32 - # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 MASK_YMM_LO: .octa 0x00000000000000000000000000000000 .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + #endif diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index d210174a52b0..2dd3674b5a1e 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -117,7 +117,7 @@ inline void sha512_init_digest(uint64_t *digest) } inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); @@ -221,7 +221,7 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit } static struct sha512_hash_ctx - *sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr) + *sha512_ctx_mgr_get_comp_ctx(struct mcryptd_alg_cstate *cstate) { /* * If get_comp_job returns NULL, there are no jobs complete. @@ -233,11 +233,17 @@ static struct sha512_hash_ctx * Otherwise, all jobs currently being managed by the hash_ctx_mgr * still need processing. */ + struct sha512_ctx_mgr *mgr; struct sha512_hash_ctx *ctx; + unsigned long flags; + mgr = cstate->mgr; + spin_lock_irqsave(&cstate->work_lock, flags); ctx = (struct sha512_hash_ctx *) sha512_job_mgr_get_comp_job(&mgr->mgr); - return sha512_ctx_mgr_resubmit(mgr, ctx); + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + spin_unlock_irqrestore(&cstate->work_lock, flags); + return ctx; } static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr) @@ -246,12 +252,17 @@ static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr) } static struct sha512_hash_ctx - *sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr, + *sha512_ctx_mgr_submit(struct mcryptd_alg_cstate *cstate, struct sha512_hash_ctx *ctx, const void *buffer, uint32_t len, int flags) { + struct sha512_ctx_mgr *mgr; + unsigned long irqflags; + + mgr = cstate->mgr; + spin_lock_irqsave(&cstate->work_lock, irqflags); if (flags & (~HASH_ENTIRE)) { /* * User should not pass anything other than FIRST, UPDATE, or @@ -351,20 +362,26 @@ static struct sha512_hash_ctx } } - return sha512_ctx_mgr_resubmit(mgr, ctx); + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + spin_unlock_irqrestore(&cstate->work_lock, irqflags); + return ctx; } -static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr) +static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct mcryptd_alg_cstate *cstate) { + struct sha512_ctx_mgr *mgr; struct sha512_hash_ctx *ctx; + unsigned long flags; + mgr = cstate->mgr; + spin_lock_irqsave(&cstate->work_lock, flags); while (1) { ctx = (struct sha512_hash_ctx *) sha512_job_mgr_flush(&mgr->mgr); /* If flush returned 0, there are no more jobs in flight. */ if (!ctx) - return NULL; + break; /* * If flush returned a job, resubmit the job to finish @@ -378,8 +395,10 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr) * the sha512_ctx_mgr still need processing. Loop. */ if (ctx) - return ctx; + break; } + spin_unlock_irqrestore(&cstate->work_lock, flags); + return ctx; } static int sha512_mb_init(struct ahash_request *areq) @@ -439,11 +458,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(&rctx->areq); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data, nbytes, flag); if (!sha_ctx) { if (flush) - sha_ctx = sha512_ctx_mgr_flush(cstate->mgr); + sha_ctx = sha512_ctx_mgr_flush(cstate); } kernel_fpu_end(); if (sha_ctx) @@ -471,11 +490,12 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, struct sha512_hash_ctx *sha_ctx; struct mcryptd_hash_request_ctx *req_ctx; int ret; + unsigned long flags; /* remove from work list */ - spin_lock(&cstate->work_lock); + spin_lock_irqsave(&cstate->work_lock, flags); list_del(&rctx->waiter); - spin_unlock(&cstate->work_lock); + spin_unlock_irqrestore(&cstate->work_lock, flags); if (irqs_disabled()) rctx->complete(&req->base, err); @@ -486,14 +506,14 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, } /* check to see if there are other jobs that are done */ - sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr); + sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate); while (sha_ctx) { req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx); ret = sha_finish_walk(&req_ctx, cstate, false); if (req_ctx) { - spin_lock(&cstate->work_lock); + spin_lock_irqsave(&cstate->work_lock, flags); list_del(&req_ctx->waiter); - spin_unlock(&cstate->work_lock); + spin_unlock_irqrestore(&cstate->work_lock, flags); req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) @@ -504,7 +524,7 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, local_bh_enable(); } } - sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr); + sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate); } return 0; @@ -515,6 +535,7 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx, { unsigned long next_flush; unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL); + unsigned long flags; /* initialize tag */ rctx->tag.arrival = jiffies; /* tag the arrival time */ @@ -522,9 +543,9 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx, next_flush = rctx->tag.arrival + delay; rctx->tag.expire = next_flush; - spin_lock(&cstate->work_lock); + spin_lock_irqsave(&cstate->work_lock, flags); list_add_tail(&rctx->waiter, &cstate->work_list); - spin_unlock(&cstate->work_lock); + spin_unlock_irqrestore(&cstate->work_lock, flags); mcryptd_arm_flusher(cstate, delay); } @@ -565,7 +586,7 @@ static int sha512_mb_update(struct ahash_request *areq) sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq); sha512_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE); kernel_fpu_end(); @@ -628,7 +649,7 @@ static int sha512_mb_finup(struct ahash_request *areq) sha512_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data, nbytes, flag); kernel_fpu_end(); @@ -677,8 +698,7 @@ static int sha512_mb_final(struct ahash_request *areq) /* flag HASH_FINAL and 0 data size */ sha512_mb_add_list(rctx, cstate); kernel_fpu_begin(); - sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, - HASH_LAST); + sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, &data, 0, HASH_LAST); kernel_fpu_end(); /* check if anything is returned */ @@ -940,7 +960,7 @@ static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate) break; kernel_fpu_begin(); sha_ctx = (struct sha512_hash_ctx *) - sha512_ctx_mgr_flush(cstate->mgr); + sha512_ctx_mgr_flush(cstate); kernel_fpu_end(); if (!sha_ctx) { pr_err("sha512_mb error: nothing got flushed for" diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h index 9d4b2c8208d5..e4653f5eec3f 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h +++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h @@ -119,7 +119,7 @@ struct sha512_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S index 3ddba19a0db6..7c629caebc05 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S @@ -280,12 +280,18 @@ ENTRY(sha512_mb_mgr_get_comp_job_avx2) pop %rbx ret ENDPROC(sha512_mb_mgr_get_comp_job_avx2) -.data -.align 16 +.section .rodata.cst8.one, "aM", @progbits, 8 +.align 8 one: .quad 1 + +.section .rodata.cst8.two, "aM", @progbits, 8 +.align 8 two: .quad 2 + +.section .rodata.cst8.three, "aM", @progbits, 8 +.align 8 three: .quad 3 diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S index 815f07bdd1f8..4ba709ba78e5 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S @@ -209,8 +209,9 @@ return_null: xor job_rax, job_rax jmp return ENDPROC(sha512_mb_mgr_submit_avx2) -.data +/* UNUSED? +.section .rodata.cst16, "aM", @progbits, 16 .align 16 H0: .int 0x6a09e667 H1: .int 0xbb67ae85 @@ -220,3 +221,4 @@ H4: .int 0x510e527f H5: .int 0x9b05688c H6: .int 0x1f83d9ab H7: .int 0x5be0cd19 +*/ diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S index 31ab1eff6413..e22e907643a6 100644 --- a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S +++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S @@ -361,7 +361,7 @@ Lrounds_16_xx: ret ENDPROC(sha512_x4_avx2) -.data +.section .rodata.K512_4, "a", @progbits .align 64 K512_4: .octa 0x428a2f98d728ae22428a2f98d728ae22,\ @@ -525,5 +525,7 @@ K512_4: .octa 0x6c44198c4a4758176c44198c4a475817,\ 0x6c44198c4a4758176c44198c4a475817 +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index e610e29cbc81..66bbd9058a90 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -369,14 +369,17 @@ ENDPROC(sha512_transform_ssse3) ######################################################################## ### Binary Data -.data - +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 .align 16 - # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. XMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 # K[t] used in SHA512 hashing K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index dc66273e610d..b3f49d286348 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -29,11 +29,13 @@ .file "twofish-avx-x86_64-asm_64.S" -.data +.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16 .align 16 - .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 +.align 16 .Lxts_gf128mul_and_shl1_mask: .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9a9e5884066c..05ed3d393da7 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -90,8 +90,8 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 - .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - addq $-(15*8+\addskip), %rsp + .macro ALLOC_PT_GPREGS_ON_STACK + addq $-(15*8), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 @@ -147,15 +147,6 @@ For 32-bit we have the following conventions - kernel is built with movq 5*8+\offset(%rsp), %rbx .endm - .macro ZERO_EXTRA_REGS - xorl %r15d, %r15d - xorl %r14d, %r14d - xorl %r13d, %r13d - xorl %r12d, %r12d - xorl %ebp, %ebp - xorl %ebx, %ebx - .endm - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 movq 6*8(%rsp), %r11 @@ -201,6 +192,26 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts + * the original rbp. + */ +.macro ENCODE_FRAME_POINTER ptregs_offset=0 +#ifdef CONFIG_FRAME_POINTER + .if \ptregs_offset + leaq \ptregs_offset(%rsp), %rbp + .else + mov %rsp, %rbp + .endif + orq $0x1, %rbp +#endif +.endm + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bdd9cc59d20f..370c42c7f046 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> @@ -25,7 +26,7 @@ #include <asm/desc.h> #include <asm/traps.h> #include <asm/vdso.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/cpufeature.h> #define CREATE_TRACE_POINTS diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 21b352a11b49..57f7ec35216e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -45,6 +45,7 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/export.h> +#include <asm/frame.h> .section .entry.text, "ax" @@ -175,6 +176,22 @@ SET_KERNEL_GS %edx .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the + * original rbp. + */ +.macro ENCODE_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER + mov %esp, %ebp + orl $0x1, %ebp +#endif +.endm + .macro RESTORE_INT_REGS popl %ebx popl %ecx @@ -245,6 +262,12 @@ END(__switch_to_asm) * edi: kernel thread arg */ ENTRY(ret_from_fork) + FRAME_BEGIN /* help unwinder find end of stack */ + + /* + * schedule_tail() is asmlinkage so we have to put its 'prev' argument + * on the stack. + */ pushl %eax call schedule_tail popl %eax @@ -254,8 +277,9 @@ ENTRY(ret_from_fork) 2: /* When we fork, we trace the syscall return in the child, too. */ - movl %esp, %eax + leal FRAME_OFFSET(%esp), %eax call syscall_return_slowpath + FRAME_END jmp restore_all /* kernel thread */ @@ -307,13 +331,13 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: +.Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) jnz restore_all testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq - jmp need_resched + jmp .Lneed_resched END(resume_kernel) #endif @@ -334,7 +358,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region) */ ENTRY(xen_sysenter_target) addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp + jmp .Lsysenter_past_esp #endif /* @@ -371,7 +395,7 @@ ENTRY(xen_sysenter_target) */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp -sysenter_past_esp: +.Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ @@ -504,9 +528,9 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET -restore_all_notrace: +.Lrestore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 - ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* @@ -518,22 +542,23 @@ restore_all_notrace: movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS + je .Lldt_ss # returning to user-space with LDT SS #endif -restore_nocheck: +.Lrestore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: +.Lirq_return: INTERRUPT_RETURN + .section .fixup, "ax" ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error - jmp error_code + jmp common_exception .previous - _ASM_EXTABLE(irq_return, iret_exc) + _ASM_EXTABLE(.Lirq_return, iret_exc) #ifdef CONFIG_X86_ESPFIX32 -ldt_ss: +.Lldt_ss: /* * Setup and switch to ESPFIX stack * @@ -562,7 +587,7 @@ ldt_ss: */ DISABLE_INTERRUPTS(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck + jmp .Lrestore_nocheck #endif ENDPROC(entry_INT80_32) @@ -624,6 +649,7 @@ common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax call do_IRQ @@ -635,6 +661,7 @@ ENTRY(name) \ ASM_CLAC; \ pushl $~(nr); \ SAVE_ALL; \ + ENCODE_FRAME_POINTER; \ TRACE_IRQS_OFF \ movl %esp, %eax; \ call fn; \ @@ -659,7 +686,7 @@ ENTRY(coprocessor_error) ASM_CLAC pushl $0 pushl $do_coprocessor_error - jmp error_code + jmp common_exception END(coprocessor_error) ENTRY(simd_coprocessor_error) @@ -673,14 +700,14 @@ ENTRY(simd_coprocessor_error) #else pushl $do_simd_coprocessor_error #endif - jmp error_code + jmp common_exception END(simd_coprocessor_error) ENTRY(device_not_available) ASM_CLAC pushl $-1 # mark this as an int pushl $do_device_not_available - jmp error_code + jmp common_exception END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -694,59 +721,59 @@ ENTRY(overflow) ASM_CLAC pushl $0 pushl $do_overflow - jmp error_code + jmp common_exception END(overflow) ENTRY(bounds) ASM_CLAC pushl $0 pushl $do_bounds - jmp error_code + jmp common_exception END(bounds) ENTRY(invalid_op) ASM_CLAC pushl $0 pushl $do_invalid_op - jmp error_code + jmp common_exception END(invalid_op) ENTRY(coprocessor_segment_overrun) ASM_CLAC pushl $0 pushl $do_coprocessor_segment_overrun - jmp error_code + jmp common_exception END(coprocessor_segment_overrun) ENTRY(invalid_TSS) ASM_CLAC pushl $do_invalid_TSS - jmp error_code + jmp common_exception END(invalid_TSS) ENTRY(segment_not_present) ASM_CLAC pushl $do_segment_not_present - jmp error_code + jmp common_exception END(segment_not_present) ENTRY(stack_segment) ASM_CLAC pushl $do_stack_segment - jmp error_code + jmp common_exception END(stack_segment) ENTRY(alignment_check) ASM_CLAC pushl $do_alignment_check - jmp error_code + jmp common_exception END(alignment_check) ENTRY(divide_error) ASM_CLAC pushl $0 # no error code pushl $do_divide_error - jmp error_code + jmp common_exception END(divide_error) #ifdef CONFIG_X86_MCE @@ -754,7 +781,7 @@ ENTRY(machine_check) ASM_CLAC pushl $0 pushl machine_check_vector - jmp error_code + jmp common_exception END(machine_check) #endif @@ -762,13 +789,14 @@ ENTRY(spurious_interrupt_bug) ASM_CLAC pushl $0 pushl $do_spurious_interrupt_bug - jmp error_code + jmp common_exception END(spurious_interrupt_bug) #ifdef CONFIG_XEN ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF /* @@ -823,6 +851,7 @@ ENTRY(xen_failsafe_callback) jmp iret_exc 5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER jmp ret_from_exception .section .fixup, "ax" @@ -882,15 +911,15 @@ ftrace_call: popl %edx popl %ecx popl %eax -ftrace_ret: +.Lftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: jmp ftrace_stub #endif -.globl ftrace_stub -ftrace_stub: +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) ret END(ftrace_caller) @@ -952,7 +981,7 @@ GLOBAL(ftrace_regs_call) popl %gs addl $8, %esp /* Skip orig_ax and ip */ popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret + jmp .Lftrace_ret popf jmp ftrace_stub @@ -963,7 +992,7 @@ ENTRY(mcount) jb ftrace_stub /* Paging not enabled yet? */ cmpl $ftrace_stub, ftrace_trace_function - jnz trace + jnz .Ltrace #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller @@ -976,7 +1005,7 @@ ftrace_stub: ret /* taken from glibc */ -trace: +.Ltrace: pushl %eax pushl %ecx pushl %edx @@ -1027,7 +1056,7 @@ return_to_handler: ENTRY(trace_page_fault) ASM_CLAC pushl $trace_do_page_fault - jmp error_code + jmp common_exception END(trace_page_fault) #endif @@ -1035,7 +1064,10 @@ ENTRY(page_fault) ASM_CLAC pushl $do_page_fault ALIGN -error_code: + jmp common_exception +END(page_fault) + +common_exception: /* the function address is in %gs's slot on the stack */ pushl %fs pushl %es @@ -1047,6 +1079,7 @@ error_code: pushl %edx pushl %ecx pushl %ebx + ENCODE_FRAME_POINTER cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs @@ -1064,7 +1097,7 @@ error_code: movl %esp, %eax # pt_regs pointer call *%edi jmp ret_from_exception -END(page_fault) +END(common_exception) ENTRY(debug) /* @@ -1079,6 +1112,7 @@ ENTRY(debug) ASM_CLAC pushl $-1 # mark this as an int SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer @@ -1094,11 +1128,11 @@ ENTRY(debug) .Ldebug_from_sysenter_stack: /* We're on the SYSENTER stack. Switch off. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp TRACE_IRQS_OFF call do_debug - movl %ebp, %esp + movl %ebx, %esp jmp ret_from_exception END(debug) @@ -1116,11 +1150,12 @@ ENTRY(nmi) movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl %eax - je nmi_espfix_stack + je .Lnmi_espfix_stack #endif pushl %eax # pt_regs->orig_ax SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1132,21 +1167,21 @@ ENTRY(nmi) /* Not on SYSENTER stack. */ call do_nmi - jmp restore_all_notrace + jmp .Lrestore_all_notrace .Lnmi_from_sysenter_stack: /* * We're on the SYSENTER stack. Switch off. No one (not even debug) * is using the thread stack right now, so it's safe for us to use it. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi - movl %ebp, %esp - jmp restore_all_notrace + movl %ebx, %esp + jmp .Lrestore_all_notrace #ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: +.Lnmi_espfix_stack: /* * create the pointer to lss back */ @@ -1159,12 +1194,13 @@ nmi_espfix_stack: .endr pushl %eax SAVE_ALL + ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return + jmp .Lirq_return #endif END(nmi) @@ -1172,6 +1208,7 @@ ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1181,14 +1218,14 @@ END(int3) ENTRY(general_protection) pushl $do_general_protection - jmp error_code + jmp common_exception END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) ASM_CLAC pushl $do_async_page_fault - jmp error_code + jmp common_exception END(async_page_fault) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ef766a358b37..044d18ebc43c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,14 +36,9 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - .code64 .section .entry.text, "ax" @@ -414,17 +409,19 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + FRAME_BEGIN /* help unwinder find end of stack */ movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ + call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ 2: - movq %rsp, %rdi + leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */ call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS + FRAME_END jmp restore_regs_and_iret 1: @@ -469,6 +466,7 @@ END(irq_entries_start) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER testb $3, CS(%rsp) jz 1f @@ -985,6 +983,7 @@ ENTRY(xen_failsafe_callback) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) @@ -1028,6 +1027,7 @@ ENTRY(paranoid_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 movl $1, %ebx movl $MSR_GS_BASE, %ecx rdmsr @@ -1075,6 +1075,7 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 xorl %ebx, %ebx testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1257,6 +1258,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + ENCODE_FRAME_POINTER /* * At this point we no longer need to worry about stack damage @@ -1270,11 +1272,10 @@ ENTRY(nmi) /* * Return back to user mode. We must *not* do the normal exit - * work, because we don't want to enable interrupts. Fortunately, - * do_nmi doesn't modify pt_regs. + * work, because we don't want to enable interrupts. */ SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2b3618542544..9ba050fe47f3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -389,3 +389,4 @@ 380 i386 pkey_mprotect sys_pkey_mprotect 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free +383 i386 statx sys_statx diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index e93ef0b38db8..5aef183e2f85 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -338,6 +338,7 @@ 329 common pkey_mprotect sys_pkey_mprotect 330 common pkey_alloc sys_pkey_alloc 331 common pkey_free sys_pkey_free +332 common statx sys_statx # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 02223cb4bcfd..9d4d6e138311 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -92,10 +92,10 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } -static notrace cycle_t vread_pvclock(int *mode) +static notrace u64 vread_pvclock(int *mode) { const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; - cycle_t ret; + u64 ret; u64 last; u32 version; @@ -142,9 +142,9 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif -notrace static cycle_t vread_tsc(void) +notrace static u64 vread_tsc(void) { - cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 ret = (u64)rdtsc_ordered(); u64 last = gtod->cycle_last; if (likely(ret >= last)) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 23c881caabd1..226ca70dc6bd 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/random.h> @@ -109,7 +110,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + ret = vm_insert_pfn(vma, vmf->address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = @@ -117,7 +118,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, - (unsigned long)vmf->virtual_address, + vmf->address, __pa(pvti) >> PAGE_SHIFT); } } @@ -161,8 +162,6 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; - current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -188,15 +187,13 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); - do_munmap(mm, text_start, image->size); + do_munmap(mm, text_start, image->size, NULL); + } else { + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; } up_fail: - if (ret) { - current->mm->context.vdso = NULL; - current->mm->context.vdso_image = NULL; - } - up_write(&mm->mmap_sem); return ret; } @@ -375,7 +372,7 @@ static int __init init_vdso(void) /* notifier priority > KVM */ return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, - "AP_X86_VDSO_VMA_ONLINE", vgetcpu_online, NULL); + "x86/vdso/vma:online", vgetcpu_online, NULL); } subsys_initcall(init_vdso); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 636c4b341f36..ce1d7534fa53 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -27,6 +27,8 @@ #include <linux/kernel.h> #include <linux/timer.h> +#include <linux/sched/signal.h> +#include <linux/mm_types.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 1d392c39fe56..b8ccdb5c9244 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,11 +1,4 @@ -obj-y += core.o - -obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o -obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o -obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o -endif - -obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-y += core.o +obj-y += amd/ +obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile new file mode 100644 index 000000000000..b1da46f396e0 --- /dev/null +++ b/arch/x86/events/amd/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o +obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += iommu.o +endif + diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index b26ee32f73e8..786fd875de92 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <linux/ptrace.h> #include <linux/syscore_ops.h> +#include <linux/sched/clock.h> #include <asm/apic.h> @@ -1010,7 +1011,7 @@ static __init int amd_ibs_init(void) * all online cpus. */ cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - "AP_PERF_X86_AMD_IBS_STARTING", + "perf/x86/amd/ibs:starting", x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 9842270ed2f2..a6eee5ac4f58 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -291,7 +291,7 @@ static int __init amd_power_pmu_init(void) cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, - "AP_PERF_X86_AMD_POWER_ONLINE", + "perf/x86/amd/power:online", power_cpu_init, power_cpu_exit); ret = perf_pmu_register(&pmu_class, "power", -1); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 65577f081d07..4d1f7f2d9aff 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -22,13 +22,17 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 -#define MAX_COUNTERS NUM_COUNTERS_NB +#define NUM_COUNTERS_L3 6 +#define MAX_COUNTERS 6 #define RDPMC_BASE_NB 6 -#define RDPMC_BASE_L2 10 +#define RDPMC_BASE_LLC 10 #define COUNTER_SHIFT 16 +static int num_counters_llc; +static int num_counters_nb; + static HLIST_HEAD(uncore_unused_list); struct amd_uncore { @@ -45,30 +49,30 @@ struct amd_uncore { }; static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_l2; +static struct amd_uncore * __percpu *amd_uncore_llc; static struct pmu amd_nb_pmu; -static struct pmu amd_l2_pmu; +static struct pmu amd_llc_pmu; static cpumask_t amd_nb_active_mask; -static cpumask_t amd_l2_active_mask; +static cpumask_t amd_llc_active_mask; static bool is_nb_event(struct perf_event *event) { return event->pmu->type == amd_nb_pmu.type; } -static bool is_l2_event(struct perf_event *event) +static bool is_llc_event(struct perf_event *event) { - return event->pmu->type == amd_l2_pmu.type; + return event->pmu->type == amd_llc_pmu.type; } static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) { if (is_nb_event(event) && amd_uncore_nb) return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_l2_event(event) && amd_uncore_l2) - return *per_cpu_ptr(amd_uncore_l2, event->cpu); + else if (is_llc_event(event) && amd_uncore_llc) + return *per_cpu_ptr(amd_uncore_llc, event->cpu); return NULL; } @@ -183,16 +187,16 @@ static int amd_uncore_event_init(struct perf_event *event) return -ENOENT; /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. + * NB and Last level cache counters (MSRs) are shared across all cores + * that share the same NB / Last level cache. Interrupts can be directed + * to a single target core, however, event counts generated by processes + * running on other cores cannot be masked out. So we do not support + * sampling and per-thread events. */ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - /* NB and L2 counters do not have usr/os/guest/host bits */ + /* NB and Last level cache counters do not have usr/os/guest/host bits */ if (event->attr.exclude_user || event->attr.exclude_kernel || event->attr.exclude_host || event->attr.exclude_guest) return -EINVAL; @@ -226,8 +230,8 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, if (pmu->type == amd_nb_pmu.type) active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_l2_pmu.type) - active_mask = &amd_l2_active_mask; + else if (pmu->type == amd_llc_pmu.type) + active_mask = &amd_llc_active_mask; else return 0; @@ -244,30 +248,47 @@ static struct attribute_group amd_uncore_attr_group = { .attrs = amd_uncore_attrs, }; -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15"); - -static struct attribute *amd_uncore_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_format_group = { - .name = "format", - .attrs = amd_uncore_format_attr, +/* + * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based + * on family + */ +#define AMD_FORMAT_ATTR(_dev, _name, _format) \ +static ssize_t \ +_dev##_show##_name(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev); + +/* Used for each uncore counter type */ +#define AMD_ATTRIBUTE(_name) \ +static struct attribute *amd_uncore_format_attr_##_name[] = { \ + &format_attr_event_##_name.attr, \ + &format_attr_umask.attr, \ + NULL, \ +}; \ +static struct attribute_group amd_uncore_format_group_##_name = { \ + .name = "format", \ + .attrs = amd_uncore_format_attr_##_name, \ +}; \ +static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \ + &amd_uncore_attr_group, \ + &amd_uncore_format_group_##_name, \ + NULL, \ }; -static const struct attribute_group *amd_uncore_attr_groups[] = { - &amd_uncore_attr_group, - &amd_uncore_format_group, - NULL, -}; +AMD_FORMAT_ATTR(event, , "config:0-7,32-35"); +AMD_FORMAT_ATTR(umask, , "config:8-15"); +AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60"); +AMD_FORMAT_ATTR(event, _l3, "config:0-7"); +AMD_ATTRIBUTE(df); +AMD_ATTRIBUTE(l3); static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_nb", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -276,10 +297,8 @@ static struct pmu amd_nb_pmu = { .read = amd_uncore_read, }; -static struct pmu amd_l2_pmu = { +static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, - .attr_groups = amd_uncore_attr_groups, - .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -296,14 +315,14 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_l2; + struct amd_uncore *uncore_nb = NULL, *uncore_llc; if (amd_uncore_nb) { uncore_nb = amd_uncore_alloc(cpu); if (!uncore_nb) goto fail; uncore_nb->cpu = cpu; - uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->num_counters = num_counters_nb; uncore_nb->rdpmc_base = RDPMC_BASE_NB; uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; @@ -312,18 +331,18 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } - if (amd_uncore_l2) { - uncore_l2 = amd_uncore_alloc(cpu); - if (!uncore_l2) + if (amd_uncore_llc) { + uncore_llc = amd_uncore_alloc(cpu); + if (!uncore_llc) goto fail; - uncore_l2->cpu = cpu; - uncore_l2->num_counters = NUM_COUNTERS_L2; - uncore_l2->rdpmc_base = RDPMC_BASE_L2; - uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_l2->active_mask = &amd_l2_active_mask; - uncore_l2->pmu = &amd_l2_pmu; - uncore_l2->id = -1; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; + uncore_llc->cpu = cpu; + uncore_llc->num_counters = num_counters_llc; + uncore_llc->rdpmc_base = RDPMC_BASE_LLC; + uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_llc->active_mask = &amd_llc_active_mask; + uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->id = -1; + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; } return 0; @@ -376,17 +395,17 @@ static int amd_uncore_cpu_starting(unsigned int cpu) *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; } - if (amd_uncore_l2) { + if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; unsigned int nshared; - uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + uncore = *per_cpu_ptr(amd_uncore_llc, cpu); cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); nshared = ((eax >> 14) & 0xfff) + 1; uncore->id = apicid - (apicid % nshared); - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; } return 0; @@ -419,8 +438,8 @@ static int amd_uncore_cpu_online(unsigned int cpu) if (amd_uncore_nb) uncore_online(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_online(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_online(cpu, amd_uncore_llc); return 0; } @@ -456,8 +475,8 @@ static int amd_uncore_cpu_down_prepare(unsigned int cpu) if (amd_uncore_nb) uncore_down_prepare(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_down_prepare(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_down_prepare(cpu, amd_uncore_llc); return 0; } @@ -479,8 +498,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu) if (amd_uncore_nb) uncore_dead(cpu, amd_uncore_nb); - if (amd_uncore_l2) - uncore_dead(cpu, amd_uncore_l2); + if (amd_uncore_llc) + uncore_dead(cpu, amd_uncore_llc); return 0; } @@ -492,6 +511,47 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) goto fail_nodev; + switch(boot_cpu_data.x86) { + case 23: + /* Family 17h: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L3; + /* + * For Family17h, the NorthBridge counters are + * re-purposed as Data Fabric counters. Also, support is + * added for L3 counters. The pmus are exported based on + * family as either L2 or L3 and NB or DF. + */ + amd_nb_pmu.name = "amd_df"; + amd_llc_pmu.name = "amd_l3"; + format_attr_event_df.show = &event_show_df; + format_attr_event_l3.show = &event_show_l3; + break; + case 22: + /* Family 16h - may change: */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + break; + default: + /* + * All prior families have the same number of + * NorthBridge and Last Level Cache counters + */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + break; + } + amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; + amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; @@ -510,16 +570,16 @@ static int __init amd_uncore_init(void) } if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { - amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_l2) { + amd_uncore_llc = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_llc) { ret = -ENOMEM; - goto fail_l2; + goto fail_llc; } - ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1); if (ret) - goto fail_l2; + goto fail_llc; - pr_info("perf: AMD L2I counters detected\n"); + pr_info("perf: AMD LLC counters detected\n"); ret = 0; } @@ -527,16 +587,16 @@ static int __init amd_uncore_init(void) * Install callbacks. Core will call them for each online cpu. */ if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, - "PERF_X86_AMD_UNCORE_PREP", + "perf/x86/amd/uncore:prepare", amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) - goto fail_l2; + goto fail_llc; if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, - "AP_PERF_X86_AMD_UNCORE_STARTING", + "perf/x86/amd/uncore:starting", amd_uncore_cpu_starting, NULL)) goto fail_prep; if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, - "AP_PERF_X86_AMD_UNCORE_ONLINE", + "perf/x86/amd/uncore:online", amd_uncore_cpu_online, amd_uncore_cpu_down_prepare)) goto fail_start; @@ -546,11 +606,11 @@ fail_start: cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); fail_prep: cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); -fail_l2: +fail_llc: if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_l2) - free_percpu(amd_uncore_l2); + if (amd_uncore_llc) + free_percpu(amd_uncore_llc); fail_nb: if (amd_uncore_nb) free_percpu(amd_uncore_nb); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6e395c996900..349d4d17aa7f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -20,7 +20,8 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/clock.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/cpu.h> @@ -365,7 +366,11 @@ int x86_add_exclusive(unsigned int what) { int i; - if (x86_pmu.lbr_pt_coexist) + /* + * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. + * LBR and BTS are still mutually exclusive. + */ + if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return 0; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { @@ -388,7 +393,7 @@ fail_unlock: void x86_del_exclusive(unsigned int what) { - if (x86_pmu.lbr_pt_coexist) + if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); @@ -501,6 +506,10 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + + /* There's no sense in having PEBS for non sampling events: */ + if (!is_sampling_event(event)) + return -EINVAL; } /* * check that PEBS LBR correction does not conflict with @@ -1816,18 +1825,18 @@ static int __init init_hw_perf_events(void) * Install callbacks. Core will call them for each online * cpu. */ - err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "PERF_X86_PREPARE", + err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, - "AP_PERF_X86_STARTING", x86_pmu_starting_cpu, + "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; - err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "AP_PERF_X86_ONLINE", + err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; @@ -2299,7 +2308,7 @@ valid_user_frame(const void __user *fp, unsigned long size) static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; - int idx = segment >> 3; + unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cb8522290e6a..eb1484c86bb4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2110,6 +2110,27 @@ again: GLOBAL_STATUS_LBRS_FROZEN); if (!status) goto done; + /* + * In case multiple PEBS events are sampled at the same time, + * it is possible to have GLOBAL_STATUS bit 62 set indicating + * PEBS buffer overflow and also seeing at most 3 PEBS counters + * having their bits set in the status register. This is a sign + * that there was at least one PEBS record pending at the time + * of the PMU interrupt. PEBS counters must only be processed + * via the drain_pebs() calls and not via the regular sample + * processing loop coming after that the function, otherwise + * phony regular samples may be generated in the sampling buffer + * not marked with the EXACT tag. Another possibility is to have + * one PEBS event and at least one non-PEBS event whic hoverflows + * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will + * not be set, yet the overflow status bit for the PEBS counter will + * be on Skylake. + * + * To avoid this problem, we systematically ignore the PEBS-enabled + * counters from the GLOBAL_STATUS mask and we always process PEBS + * events via drain_pebs(). + */ + status &= ~cpuc->pebs_enabled; /* * PEBS overflow sets bit 62 in the global status register @@ -2117,15 +2138,6 @@ again: if (__test_and_clear_bit(62, (unsigned long *)&status)) { handled++; x86_pmu.drain_pebs(regs); - /* - * There are cases where, even though, the PEBS ovfl bit is set - * in GLOBAL_OVF_STATUS, the PEBS events may also have their - * overflow bits set for their counters. We must clear them - * here because they have been processed as exact samples in - * the drain_pebs() routine. They must not be processed again - * in the for_each_bit_set() loop for regular samples below. - */ - status &= ~cpuc->pebs_enabled; status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; } @@ -3164,13 +3176,16 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct cpu_hw_events *sibling; struct intel_excl_cntrs *c; - c = per_cpu(cpu_hw_events, i).excl_cntrs; + sibling = &per_cpu(cpu_hw_events, i); + c = sibling->excl_cntrs; if (c && c->core_id == core_id) { cpuc->kfree_on_online[1] = cpuc->excl_cntrs; cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; + if (!sibling->excl_thread_id) + cpuc->excl_thread_id = 1; break; } } @@ -3975,7 +3990,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 8f82b02934fa..8c00dc09a5d2 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -7,9 +7,9 @@ #include <linux/perf_event.h> #include <linux/slab.h> #include <asm/cpu_device_id.h> +#include <asm/intel_rdt_common.h> #include "../perf_event.h" -#define MSR_IA32_PQR_ASSOC 0x0c8f #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d @@ -24,32 +24,13 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */ static bool cqm_enabled, mbm_enabled; unsigned int mbm_socket_max; -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - /* * The cached intel_pqr_state is strictly per CPU and can never be * updated from a remote CPU. Both functions which modify the state * (intel_cqm_event_start and intel_cqm_event_stop) are called with * interrupts disabled, which is sufficient for the protection. */ -static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); static struct hrtimer *mbm_timers; /** * struct sample - mbm event's (local or total) data @@ -1766,9 +1747,9 @@ static int __init intel_cqm_init(void) * is enabled to avoid notifier leak. */ cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING, - "AP_PERF_X86_CQM_STARTING", + "perf/x86/cqm:starting", intel_cqm_cpu_starting, NULL); - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "AP_PERF_X86_CQM_ONLINE", + cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online", NULL, intel_cqm_cpu_exit); out: diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index da51e5a3e2ff..aff4b5b69d40 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu cstate_pkg_pmu = { @@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static const struct cstate_model nhm_cstates __initconst = { @@ -539,6 +541,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), { }, @@ -594,6 +599,9 @@ static int __init cstate_probe(const struct cstate_model *cm) static inline void cstate_cleanup(void) { + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); + if (has_cstate_core) perf_pmu_unregister(&cstate_core_pmu); @@ -606,16 +614,16 @@ static int __init cstate_init(void) int err; cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING, - "AP_PERF_X86_CSTATE_STARTING", cstate_cpu_init, - NULL); + "perf/x86/cstate:starting", cstate_cpu_init, NULL); cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE, - "AP_PERF_X86_CSTATE_ONLINE", NULL, cstate_cpu_exit); + "perf/x86/cstate:online", NULL, cstate_cpu_exit); if (has_cstate_core) { err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); if (err) { has_cstate_core = false; pr_info("Failed to register cstate core pmu\n"); + cstate_cleanup(); return err; } } @@ -629,8 +637,7 @@ static int __init cstate_init(void) return err; } } - - return err; + return 0; } static int __init cstate_pmu_init(void) @@ -655,8 +662,6 @@ module_init(cstate_pmu_init); static void __exit cstate_pmu_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); cstate_cleanup(); } module_exit(cstate_pmu_exit); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index be202390bbd3..9dfeeeca0ea8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; /* log dropped samples number */ - if (error[bit]) + if (error[bit]) { perf_log_lost_samples(event, error[bit]); + if (perf_event_account_interrupt(event)) + x86_pmu_stop(event, 0); + } + if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c5047b8f777b..5900471ee508 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -36,13 +36,6 @@ static DEFINE_PER_CPU(struct pt, pt_ctx); static struct pt_pmu pt_pmu; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - /* * Capabilities of Intel PT hardware, such as number of address bits or * supported output schemes, are cached and exported to userspace as "caps" @@ -64,21 +57,21 @@ static struct pt_cap_desc { u8 reg; u32 mask; } pt_caps[] = { - PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), - PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), - PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), - PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), - PT_CAP(mtc, 0, CR_EBX, BIT(3)), - PT_CAP(ptwrite, 0, CR_EBX, BIT(4)), - PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)), - PT_CAP(topa_output, 0, CR_ECX, BIT(0)), - PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), - PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), - PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), - PT_CAP(num_address_ranges, 1, CR_EAX, 0x3), - PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), - PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), - PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), + PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)), + PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)), + PT_CAP(mtc, 0, CPUID_EBX, BIT(3)), + PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), + PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), + PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), + PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), + PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), + PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) @@ -106,18 +99,24 @@ static struct attribute_group pt_cap_group = { }; PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(pwr_evt, "config:4" ); +PMU_FORMAT_ATTR(fup_on_ptw, "config:5" ); PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(ptw, "config:12" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { &format_attr_cyc.attr, + &format_attr_pwr_evt.attr, + &format_attr_fup_on_ptw.attr, &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_ptw.attr, &format_attr_mtc_period.attr, &format_attr_cyc_thresh.attr, &format_attr_psb_period.attr, @@ -213,10 +212,10 @@ static int __init pt_pmu_hw_init(void) for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); + &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 0a535cea8ff3..22054ca49026 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -161,7 +161,13 @@ static u64 rapl_timer_ms; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - return rapl_pmus->pmus[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -402,6 +408,8 @@ static int rapl_pmu_event_init(struct perf_event *event) /* must be done before validate_group */ pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -585,6 +593,20 @@ static int rapl_cpu_online(unsigned int cpu) struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + } + /* * Check if there is an online cpu in the package which collects rapl * events already. @@ -598,27 +620,6 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_cpu_prepare(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - - if (pmu) - return 0; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - pmu->cpu = -1; - rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; - return 0; -} - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -697,6 +698,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; return 0; } @@ -769,6 +771,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), {}, }; @@ -802,29 +807,21 @@ static int __init rapl_pmu_init(void) /* * Install callbacks. Core will call them for each online cpu. */ - - ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "PERF_X86_RAPL_PREP", - rapl_cpu_prepare, NULL); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, - "AP_PERF_X86_RAPL_ONLINE", + "perf/x86/rapl:online", rapl_cpu_online, rapl_cpu_offline); if (ret) - goto out1; + goto out; ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out2; + goto out1; rapl_advertise(); return 0; -out2: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out1: - cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -835,7 +832,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index dbaaf7dc8373..758c1aa5009d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -100,7 +100,13 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - return pmu->boxes[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < max_packages ? pmu->boxes[pkgid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -733,6 +739,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .module = THIS_MODULE, }; } else { pmu->pmu = *pmu->type->pmu; @@ -763,30 +770,6 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) -{ - struct intel_uncore_pmu *pmu = type->pmus; - struct intel_uncore_box *box; - int i, pkg; - - if (pmu) { - pkg = topology_physical_package_id(cpu); - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box) - uncore_box_exit(box); - } - } -} - -static void uncore_exit_boxes(void *dummy) -{ - struct intel_uncore_type **types; - - for (types = uncore_msr_uncores; *types; types++) - __uncore_exit_boxes(*types++, smp_processor_id()); -} - static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int pkg; @@ -1057,86 +1040,6 @@ static void uncore_pci_exit(void) } } -static int uncore_cpu_dying(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box && atomic_dec_return(&box->refcnt) == 0) - uncore_box_exit(box); - } - } - return 0; -} - -static int first_init; - -static int uncore_cpu_starting(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg, ncpus = 1; - - if (first_init) { - /* - * On init we get the number of online cpus in the package - * and set refcount for all of them. - */ - ncpus = cpumask_weight(topology_core_cpumask(cpu)); - } - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (!box) - continue; - /* The first cpu on a package activates the box */ - if (atomic_add_return(ncpus, &box->refcnt) == ncpus) - uncore_box_init(box); - } - } - - return 0; -} - -static int uncore_cpu_prepare(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - if (pmu->boxes[pkg]) - continue; - /* First cpu of a package allocates the box */ - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - box->pmu = pmu; - box->pkgid = pkg; - pmu->boxes[pkg] = box; - } - } - return 0; -} - static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, int new_cpu) { @@ -1176,12 +1079,14 @@ static void uncore_change_context(struct intel_uncore_type **uncores, static int uncore_event_cpu_offline(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, pkg, target; /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return 0; - + goto unref; /* Find a new cpu to collect uncore events */ target = cpumask_any_but(topology_core_cpumask(cpu), cpu); @@ -1193,12 +1098,82 @@ static int uncore_event_cpu_offline(unsigned int cpu) uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); + +unref: + /* Clear the references */ + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); + } + } return 0; } +static int allocate_boxes(struct intel_uncore_type **types, + unsigned int pkg, unsigned int cpu) +{ + struct intel_uncore_box *box, *tmp; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + LIST_HEAD(allocated); + int i; + + /* Try to allocate all required boxes */ + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + goto cleanup; + box->pmu = pmu; + box->pkgid = pkg; + list_add(&box->active_list, &allocated); + } + } + /* Install them in the pmus */ + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + box->pmu->boxes[pkg] = box; + } + return 0; + +cleanup: + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + kfree(box); + } + return -ENOMEM; +} + static int uncore_event_cpu_online(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, ret, pkg, target; + + pkg = topology_logical_package_id(cpu); + ret = allocate_boxes(types, pkg, cpu); + if (ret) + return ret; + + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box && atomic_inc_return(&box->refcnt) == 1) + uncore_box_init(box); + } + } /* * Check if there is an online cpu in the package @@ -1353,6 +1328,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), {}, }; @@ -1388,38 +1365,16 @@ static int __init intel_uncore_init(void) if (cret && pret) return -ENODEV; - /* - * Install callbacks. Core will call them for each online cpu. - * - * The first online cpu of each package allocates and takes - * the refcounts for all other online cpus in that package. - * If msrs are not enabled no allocation is required and - * uncore_cpu_prepare() is not called for each online cpu. - */ - if (!cret) { - ret = cpuhp_setup_state(CPUHP_PERF_X86_UNCORE_PREP, - "PERF_X86_UNCORE_PREP", - uncore_cpu_prepare, NULL); - if (ret) - goto err; - } else { - cpuhp_setup_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP, - "PERF_X86_UNCORE_PREP", - uncore_cpu_prepare, NULL); - } - first_init = 1; - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_STARTING, - "AP_PERF_X86_UNCORE_STARTING", - uncore_cpu_starting, uncore_cpu_dying); - first_init = 0; - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, - "AP_PERF_X86_UNCORE_ONLINE", - uncore_event_cpu_online, uncore_event_cpu_offline); + /* Install hotplug callbacks to setup the targets for each package */ + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, + "perf/x86/intel/uncore:online", + uncore_event_cpu_online, + uncore_event_cpu_offline); + if (ret) + goto err; return 0; err: - /* Undo box->init_box() */ - on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); return ret; @@ -1428,9 +1383,7 @@ module_init(intel_uncore_init); static void __exit intel_uncore_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_STARTING); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 272427700d48..dae2fedc1601 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -669,7 +669,7 @@ static struct event_constraint snbep_uncore_cbox_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x1f, 0xe), UNCORE_EVENT_CONSTRAINT(0x21, 0x3), UNCORE_EVENT_CONSTRAINT(0x23, 0x3), UNCORE_EVENT_CONSTRAINT(0x31, 0x3), @@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = boot_cpu_data.logical_proc_id; if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a77ee026643d..bcbb1d2ae10b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -604,7 +604,7 @@ struct x86_pmu { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ - bool lbr_pt_coexist; /* LBR may coexist with PT */ + bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ /* * Intel PT/LBR/BTS are exclusive diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile new file mode 100644 index 000000000000..171ae09864d7 --- /dev/null +++ b/arch/x86/hyperv/Makefile @@ -0,0 +1 @@ +obj-y := hv_init.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c new file mode 100644 index 000000000000..db64baf0e500 --- /dev/null +++ b/arch/x86/hyperv/hv_init.c @@ -0,0 +1,277 @@ +/* + * X86 specific Hyper-V initialization code. + * + * Copyright (C) 2016, Microsoft, Inc. + * + * Author : K. Y. Srinivasan <kys@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ + +#include <linux/types.h> +#include <asm/hypervisor.h> +#include <asm/hyperv.h> +#include <asm/mshyperv.h> +#include <linux/version.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> + + +#ifdef CONFIG_X86_64 + +static struct ms_hyperv_tsc_page *tsc_pg; + +static u64 read_hv_clock_tsc(struct clocksource *arg) +{ + u64 current_tick; + + if (tsc_pg->tsc_sequence != 0) { + /* + * Use the tsc page to compute the value. + */ + + while (1) { + u64 tmp; + u32 sequence = tsc_pg->tsc_sequence; + u64 cur_tsc; + u64 scale = tsc_pg->tsc_scale; + s64 offset = tsc_pg->tsc_offset; + + rdtscll(cur_tsc); + /* current_tick = ((cur_tsc *scale) >> 64) + offset */ + asm("mulq %3" + : "=d" (current_tick), "=a" (tmp) + : "a" (cur_tsc), "r" (scale)); + + current_tick += offset; + if (tsc_pg->tsc_sequence == sequence) + return current_tick; + + if (tsc_pg->tsc_sequence != 0) + continue; + /* + * Fallback using MSR method. + */ + break; + } + } + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs_tsc = { + .name = "hyperv_clocksource_tsc_page", + .rating = 400, + .read = read_hv_clock_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; +#endif + +static u64 read_hv_clock_msr(struct clocksource *arg) +{ + u64 current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs_msr = { + .name = "hyperv_clocksource_msr", + .rating = 400, + .read = read_hv_clock_msr, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static void *hypercall_pg; +struct clocksource *hyperv_cs; +EXPORT_SYMBOL_GPL(hyperv_cs); + +/* + * This function is to be invoked early in the boot sequence after the + * hypervisor has been detected. + * + * 1. Setup the hypercall page. + * 2. Register Hyper-V specific clocksource. + */ +void hyperv_init(void) +{ + u64 guest_id; + union hv_x64_msr_hypercall_contents hypercall_msr; + + if (x86_hyper != &x86_hyper_ms_hyperv) + return; + + /* + * Setup the hypercall page and enable hypercalls. + * 1. Register the guest ID + * 2. Enable the hypercall and register the hypercall page + */ + guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); + wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + + hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hypercall_pg == NULL) { + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + return; + } + + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + /* + * Register Hyper-V specific clocksource. + */ +#ifdef CONFIG_X86_64 + if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { + union hv_x64_msr_hypercall_contents tsc_msr; + + tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!tsc_pg) + goto register_msr_cs; + + hyperv_cs = &hyperv_cs_tsc; + + rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + + tsc_msr.enable = 1; + tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); + + wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); + return; + } +#endif + /* + * For 32 bit guests just use the MSR based mechanism for reading + * the partition counter. + */ + +register_msr_cs: + hyperv_cs = &hyperv_cs_msr; + if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); +} + +/* + * This routine is called before kexec/kdump, it does the required cleanup. + */ +void hyperv_cleanup(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* Reset our OS id */ + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + + /* Reset the hypercall page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + /* Reset the TSC page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); +} +EXPORT_SYMBOL_GPL(hyperv_cleanup); + +/* + * hv_do_hypercall- Invoke the specified hypercall + */ +u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = (input) ? virt_to_phys(input) : 0; + u64 output_address = (output) ? virt_to_phys(output) : 0; +#ifdef CONFIG_X86_64 + u64 hv_status = 0; + + if (!hypercall_pg) + return (u64)ULLONG_MAX; + + __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); + __asm__ __volatile__("call *%3" : "=a" (hv_status) : + "c" (control), "d" (input_address), + "m" (hypercall_pg)); + + return hv_status; + +#else + + u32 control_hi = control >> 32; + u32 control_lo = control & 0xFFFFFFFF; + u32 hv_status_hi = 1; + u32 hv_status_lo = 1; + u32 input_address_hi = input_address >> 32; + u32 input_address_lo = input_address & 0xFFFFFFFF; + u32 output_address_hi = output_address >> 32; + u32 output_address_lo = output_address & 0xFFFFFFFF; + + if (!hypercall_pg) + return (u64)ULLONG_MAX; + + __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), + "=a"(hv_status_lo) : "d" (control_hi), + "a" (control_lo), "b" (input_address_hi), + "c" (input_address_lo), "D"(output_address_hi), + "S"(output_address_lo), "m" (hypercall_pg)); + + return hv_status_lo | ((u64)hv_status_hi << 32); +#endif /* !x86_64 */ +} +EXPORT_SYMBOL_GPL(hv_do_hypercall); + +void hyperv_report_panic(struct pt_regs *regs) +{ + static bool panic_reported; + + /* + * We prefer to report panic on 'die' chain as we have proper + * registers to report, but if we miss it (e.g. on BUG()) we need + * to report it on 'panic'. + */ + if (panic_reported) + return; + panic_reported = true; + + wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip); + wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax); + wrmsrl(HV_X64_MSR_CRASH_P2, regs->bx); + wrmsrl(HV_X64_MSR_CRASH_P3, regs->cx); + wrmsrl(HV_X64_MSR_CRASH_P4, regs->dx); + + /* + * Let Hyper-V know there is crash data available + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic); + +bool hv_is_hypercall_page_setup(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* Check if the hypercall page is setup */ + hypercall_msr.as_uint64 = 0; + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + if (!hypercall_msr.enable) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(hv_is_hypercall_page_setup); diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index cb26f18d43af..8d0879f1d42c 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -26,8 +26,9 @@ #include <linux/init.h> #include <linux/jiffies.h> #include <linux/perf_event.h> +#include <linux/sched/task_stack.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/user32.h> diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index cb13c0564ea7..724153797209 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -9,6 +9,7 @@ */ #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> @@ -20,7 +21,7 @@ #include <linux/compat.h> #include <linux/binfmts.h> #include <asm/ucontext.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/ptrace.h> diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 719cd702b0a4..47956c6a4fd8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -42,7 +42,7 @@ #include <linux/slab.h> #include <asm/mman.h> #include <asm/types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/atomic.h> #include <asm/vgtod.h> #include <asm/sys_ia32.h> diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 2cfed174e3c9..5d6a53fd7521 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -6,12 +6,7 @@ generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h -genhdr-y += unistd_32.h -genhdr-y += unistd_64.h -genhdr-y += unistd_x32.h - generic-y += clkdev.h -generic-y += cputime.h generic-y += dma-contiguous.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index 7a15588e45d4..7d3ece8bfb61 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -17,6 +17,8 @@ #include <linux/user.h> #include <linux/elfcore.h> +#include <linux/mm_types.h> + #include <asm/debugreg.h> /* diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 5391b0ae7cc3..395b69551fce 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -94,7 +94,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; - else if (amd_e400_c1e_detected) + else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E)) return 1; else return max_cstate; diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 5e828da2e18f..00c88a01301d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -21,6 +21,10 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, unsigned long); +extern int amd_smn_read(u16 node, u32 address, u32 *value); +extern int amd_smn_write(u16 node, u32 address, u32 value); +extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo); + struct amd_l3_cache { unsigned indices; u8 subcaches[4]; @@ -55,6 +59,7 @@ struct threshold_bank { }; struct amd_northbridge { + struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; @@ -66,7 +71,6 @@ struct amd_northbridge_info { u64 flags; struct amd_northbridge *nb; }; -extern struct amd_northbridge_info amd_northbridges; #define AMD_NB_GART BIT(0) #define AMD_NB_L3_INDEX_DISABLE BIT(1) @@ -74,20 +78,9 @@ extern struct amd_northbridge_info amd_northbridges; #ifdef CONFIG_AMD_NB -static inline u16 amd_nb_num(void) -{ - return amd_northbridges.num; -} - -static inline bool amd_nb_has_feature(unsigned feature) -{ - return ((amd_northbridges.flags & feature) == feature); -} - -static inline struct amd_northbridge *node_to_amd_nb(int node) -{ - return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; -} +u16 amd_nb_num(void); +bool amd_nb_has_feature(unsigned int feature); +struct amd_northbridge *node_to_amd_nb(int node); static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f5aaf6c83222..730ef65e8393 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -2,7 +2,6 @@ #define _ASM_X86_APIC_H #include <linux/cpumask.h> -#include <linux/pm.h> #include <asm/alternative.h> #include <asm/cpufeature.h> @@ -11,7 +10,6 @@ #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> -#include <asm/idle.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -196,7 +194,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v) static inline void native_apic_msr_eoi_write(u32 reg, u32 v) { - wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); + __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } static inline u32 native_apic_msr_read(u32 reg) @@ -332,6 +330,7 @@ struct apic { * on write for EOI. */ void (*eoi_write)(u32 reg, u32 v); + void (*native_eoi_write)(u32 reg, u32 v); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); @@ -639,7 +638,6 @@ extern void irq_exit(void); static inline void entering_irq(void) { irq_enter(); - exit_idle(); } static inline void entering_ack_irq(void) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..830b19dbfa31 --- /dev/null +++ b/arch/x86/include/asm/asm-prototypes.h @@ -0,0 +1,16 @@ +#include <asm/ftrace.h> +#include <linux/uaccess.h> +#include <asm/string.h> +#include <asm/page.h> +#include <asm/checksum.h> + +#include <asm-generic/asm-prototypes.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/special_insns.h> +#include <asm/preempt.h> + +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +#endif diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 68557f52b961..854022772c5b 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -139,6 +139,19 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } +static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +{ + bool negative; + asm volatile(LOCK_PREFIX "andb %2,%1\n\t" + CC_SET(s) + : CC_OUT(s) (negative), ADDR + : "ir" ((char) ~(1 << nr)) : "memory"); + return negative; +} + +// Let everybody know we have it +#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte + /* * __clear_bit_unlock - Clears a bit in memory * @nr: Bit to clear diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 872877d930de..e7e1942edff7 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -90,18 +90,8 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#ifdef CONFIG_DEBUG_RODATA_TEST -int rodata_test(void); -#else -static inline int rodata_test(void) -{ - return 0; -} -#endif - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index c020ee75dce7..08e7efb2c140 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -8,7 +8,7 @@ */ #include <linux/compiler.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> /** diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1d2b69fc0ceb..d59c15c3defd 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -204,6 +204,7 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a39629206864..4e7772387c6e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -100,12 +100,12 @@ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ @@ -186,13 +186,17 @@ * * Reuse free bits when adding new feature flags! */ - +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ @@ -221,11 +225,13 @@ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ @@ -279,8 +285,11 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ +#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ @@ -311,4 +320,5 @@ #define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 03bb1065c335..29e53ea7d764 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -5,8 +5,8 @@ #ifndef _CRYPTO_GLUE_HELPER_H #define _CRYPTO_GLUE_HELPER_H +#include <crypto/internal/skcipher.h> #include <linux/kernel.h> -#include <linux/crypto.h> #include <asm/fpu/api.h> #include <crypto/b128ops.h> @@ -69,6 +69,31 @@ static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, return true; } +static inline bool glue_skwalk_fpu_begin(unsigned int bsize, + int fpu_blocks_limit, + struct skcipher_walk *walk, + bool fpu_enabled, unsigned int nbytes) +{ + if (likely(fpu_blocks_limit < 0)) + return false; + + if (fpu_enabled) + return true; + + /* + * Vector-registers are only used when chunk to be processed is large + * enough, so do not enable FPU until it is necessary. + */ + if (nbytes < bsize * (unsigned int)fpu_blocks_limit) + return false; + + /* prevent sleeping if FPU is in use */ + skcipher_walk_atomise(walk); + + kernel_fpu_begin(); + return true; +} + static inline void glue_fpu_end(bool fpu_enabled) { if (fpu_enabled) @@ -139,6 +164,18 @@ extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, common_glue_func_t tweak_fn, void *tweak_ctx, void *crypt_ctx); +extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + +extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, common_glue_func_t fn); diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 12080d87da3b..cb8f9149f6c8 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -177,16 +177,8 @@ static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) struct desc_struct *d = get_cpu_gdt_table(cpu); tss_desc tss; - /* - * sizeof(unsigned long) coming from an extra "long" at the end - * of the iobitmap. See tss_struct definition in processor.h - * - * -1? seg base+limit should be pointing to the address of the - * last valid byte - */ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + - sizeof(unsigned long) - 1); + __KERNEL_TSS_LIMIT); write_gdt_entry(d, entry, &tss, DESC_TSS); } @@ -213,6 +205,54 @@ static inline void native_load_tr_desc(void) asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } +static inline void force_reload_TR(void) +{ + struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); + tss_desc tss; + + memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); + + /* + * LTR requires an available TSS, and the TSS is currently + * busy. Make it be available so that LTR will work. + */ + tss.type = DESC_TSS; + write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); + + load_TR_desc(); +} + +DECLARE_PER_CPU(bool, need_tr_refresh); + +static inline void refresh_TR(void) +{ + DEBUG_LOCKS_WARN_ON(preemptible()); + + if (unlikely(this_cpu_read(need_tr_refresh))) { + force_reload_TR(); + this_cpu_write(need_tr_refresh, false); + } +} + +/* + * If you do something evil that corrupts the cached TSS limit (I'm looking + * at you, VMX exits), call this function. + * + * The optimization here is that the TSS limit only matters for Linux if the + * IO bitmap is in use. If the TSS limit gets forced to its minimum value, + * everything works except that IO bitmap will be ignored and all CPL 3 IO + * instructions will #GP, which is exactly what we want for normal tasks. + */ +static inline void invalidate_tss_limit(void) +{ + DEBUG_LOCKS_WARN_ON(preemptible()); + + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) + force_reload_TR(); + else + this_cpu_write(need_tr_refresh, true); +} + static inline void native_load_gdt(const struct desc_ptr *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index eb5deb42484d..49265345d4d2 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -15,7 +15,7 @@ * FIXME: Accessing the desc_struct through its fields is more elegant, * and should be the one valid thing to do. However, a lot of open code * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than an union, + * incrementally. We keep the signature as a struct, rather than a union, * so we can get rid of it transparently in the future -- glommer */ /* 8 byte segment descriptor */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 684ed6c3aa67..1b3ef26e77df 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -2,9 +2,6 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#ifdef CONFIG_X86_DEV_DMA_OPS - struct dma_map_ops *dma_ops; -#endif #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif @@ -13,7 +10,7 @@ struct dev_archdata { #if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) struct dma_domain { struct list_head node; - struct dma_map_ops *dma_ops; + const struct dma_map_ops *dma_ops; int domain_nr; }; void add_dma_domain(struct dma_domain *domain); diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index ced283ac79df..af95c47d5c9e 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -59,6 +59,17 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) } #define div_u64_rem div_u64_rem +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + u32 high, low; + + asm ("mull %[b]" : "=a" (low), "=d" (high) + : [a] "a" (a), [b] "rm" (b) ); + + return low | ((u64)high) << 32; +} +#define mul_u32_u32 mul_u32_u32 + #else # include <asm-generic/div64.h> #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 44461626830e..08a0838b83fb 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -25,18 +25,11 @@ extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; -extern struct dma_map_ops *dma_ops; +extern const struct dma_map_ops *dma_ops; -static inline struct dma_map_ops *get_dma_ops(struct device *dev) +static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { -#ifndef CONFIG_X86_DEV_DMA_OPS return dma_ops; -#else - if (unlikely(!dev) || !dev->archdata.dma_ops) - return dma_ops; - else - return dev->archdata.dma_ops; -#endif } bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 476b574de99e..67313f3a9874 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_E820_H #define _ASM_X86_E820_H -#ifdef CONFIG_EFI +/* + * E820_X_MAX is the maximum size of the extended E820 table. The extended + * table may contain up to 3 extra E820 entries per possible NUMA node, so we + * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128. + * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h. + */ #include <linux/numa.h> #define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) -#else /* ! CONFIG_EFI */ -#define E820_X_MAX E820MAX -#endif + #include <uapi/asm/e820.h> + #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map *e820; @@ -26,8 +30,6 @@ extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype); extern void update_e820(void); extern void e820_setup_gap(void); -extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, - unsigned long start_addr, unsigned long long end_addr); struct setup_data; extern void parse_e820_ext(u64 phys_addr, u32 data_len); diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 389d700b961e..2f77bcefe6b4 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -191,6 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( struct efi_config { u64 image_handle; u64 table; + u64 runtime_services; u64 boot_services; u64 text_output; efi_status_t (*call)(unsigned long, ...); @@ -210,16 +211,26 @@ static inline bool efi_is_64bit(void) return __efi_early()->is64; } +#define efi_table_attr(table, attr, instance) \ + (efi_is_64bit() ? \ + ((table##_64_t *)(unsigned long)instance)->attr : \ + ((table##_32_t *)(unsigned long)instance)->attr) + +#define efi_call_proto(protocol, f, instance, ...) \ + __efi_early()->call(efi_table_attr(protocol, f, instance), \ + instance, ##__VA_ARGS__) + #define efi_call_early(f, ...) \ - __efi_early()->call(efi_is_64bit() ? \ - ((efi_boot_services_64_t *)(unsigned long) \ - __efi_early()->boot_services)->f : \ - ((efi_boot_services_32_t *)(unsigned long) \ - __efi_early()->boot_services)->f, __VA_ARGS__) + __efi_early()->call(efi_table_attr(efi_boot_services, f, \ + __efi_early()->boot_services), __VA_ARGS__) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); +#define efi_call_runtime(f, ...) \ + __efi_early()->call(efi_table_attr(efi_runtime_services, f, \ + __efi_early()->runtime_services), __VA_ARGS__) + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e7f155c3045e..9d49c18b5ea9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -258,6 +258,15 @@ extern int force_personality32; #define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX]) +extern u32 elf_hwcap2; + +/* + * HWCAP2 supplies mask with kernel enabled CPU features, so that + * the application can discover that it can safely use them. + * The bits are defined in uapi/asm/hwcap2.h. + */ +#define ELF_HWCAP2 (elf_hwcap2) + /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in intent than poking at uname or /proc/cpuinfo. diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 1c7eefe32502..7ec59edde154 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -229,18 +229,18 @@ static struct fd_routine_l { int (*_dma_setup)(char *addr, unsigned long size, int mode, int io); } fd_routine[] = { { - request_dma, - free_dma, - get_dma_residue, - dma_mem_alloc, - hard_dma_setup + ._request_dma = request_dma, + ._free_dma = free_dma, + ._get_dma_residue = get_dma_residue, + ._dma_mem_alloc = dma_mem_alloc, + ._dma_setup = hard_dma_setup }, { - vdma_request_dma, - vdma_nop, - vdma_get_dma_residue, - vdma_mem_alloc, - vdma_dma_setup + ._request_dma = vdma_request_dma, + ._free_dma = vdma_nop, + ._get_dma_residue = vdma_get_dma_residue, + ._dma_mem_alloc = vdma_mem_alloc, + ._dma_setup = vdma_dma_setup } }; diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 1429a7c736db..0877ae018fc9 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -27,16 +27,6 @@ extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); /* - * Some instructions like VIA's padlock instructions generate a spurious - * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context as well. To prevent these kernel instructions - * in interrupt context interacting wrongly with other user/kernel fpu usage, we - * should use them only in the context of irq_ts_save/restore() - */ -extern int irq_ts_save(void); -extern void irq_ts_restore(int TS_state); - -/* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. * * If 'feature_name' is set then put a human-readable description of diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2737366ea583..255645f60ca2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -60,11 +60,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return static_cpu_has(X86_FEATURE_EAGER_FPU); -} - static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); @@ -92,6 +87,16 @@ extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif + +static inline void fpstate_init_xstate(struct xregs_state *xsave) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; +} + static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; @@ -484,42 +489,42 @@ extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size) DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* - * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, - * on this CPU. + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) +static inline void __cpu_invalidate_fpregs_state(void) { - per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; + fpu->last_cpu = -1; } - -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) -{ - if (!use_eager_fpu()) - clts(); -} - -static inline void __fpregs_deactivate_hw(void) +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { - if (!use_eager_fpu()) - stts(); + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ -static inline void __fpregs_deactivate(struct fpu *fpu) +/* + * These generally need preemption protection to work, + * do try to avoid using these on their own: + */ +static inline void fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -528,8 +533,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu) trace_x86_fpu_regs_deactivated(fpu); } -/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ -static inline void __fpregs_activate(struct fpu *fpu) +static inline void fpregs_activate(struct fpu *fpu) { WARN_ON_FPU(fpu->fpregs_active); @@ -554,51 +558,19 @@ static inline int fpregs_active(void) } /* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void fpregs_activate(struct fpu *fpu) -{ - __fpregs_activate_hw(); - __fpregs_activate(fpu); -} - -static inline void fpregs_deactivate(struct fpu *fpu) -{ - __fpregs_deactivate(fpu); - __fpregs_deactivate_hw(); -} - -/* * FPU state switching for scheduling. * * This is a two-stage process: * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. * * - switch_fpu_finish() restores the new state as * necessary. */ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t -switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +static inline void +switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - fpu_switch_t fpu; - - /* - * If the task has used the math, pre-load the FPU on xsave processors - * or if the past 5 consecutive context-switches used math. - */ - fpu.preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active && - (use_eager_fpu() || new_fpu->counter > 5); - if (old_fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; @@ -608,29 +580,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! */ old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); - - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - new_fpu->counter++; - __fpregs_activate(new_fpu); - trace_x86_fpu_regs_activated(new_fpu); - prefetch(&new_fpu->state); - } else { - __fpregs_deactivate_hw(); - } - } else { - old_fpu->counter = 0; + } else old_fpu->last_cpu = -1; - if (fpu.preload) { - new_fpu->counter++; - if (fpu_want_lazy_restore(new_fpu, cpu)) - fpu.preload = 0; - else - prefetch(&new_fpu->state); - fpregs_activate(new_fpu); - } - } - return fpu; } /* @@ -638,15 +589,19 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) */ /* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. + * Set up the userspace FPU context for the new task, if the task + * has used the FPU. */ -static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (fpu_switch.preload) - copy_kernel_to_fpregs(&new_fpu->state); + bool preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active; + + if (preload) { + if (!fpregs_state_valid(new_fpu, cpu)) + copy_kernel_to_fpregs(&new_fpu->state); + fpregs_activate(new_fpu); + } } /* diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 48df486b02f9..3c80f5b9c09d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -322,17 +322,6 @@ struct fpu { unsigned char fpregs_active; /* - * @counter: - * - * This counter contains the number of consecutive context switches - * during which the FPU stays used. If this is over a threshold, the - * lazy FPU restore logic becomes eager, to save the trap overhead. - * This is an unsigned char so that after 256 iterations the counter - * wraps and the context switch behavior turns lazy again; this is to - * deal with bursty apps that only use the FPU for a short time: - */ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore @@ -340,29 +329,6 @@ struct fpu { * the registers in the FPU are more recent than this state * copy. If the task context-switches away then they get * saved here and represent the FPU state. - * - * After context switches there may be a (short) time period - * during which the in-FPU hardware registers are unchanged - * and still perfectly match this state, if the tasks - * scheduled afterwards are not using the FPU. - * - * This is the 'lazy restore' window of optimization, which - * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. - * - * We detect whether a subsequent task uses the FPU via setting - * CR0::TS to 1, which causes any FPU use to raise a #NM fault. - * - * During this window, if the task gets scheduled again, we - * might be able to skip having to do a restore from this - * memory buffer to the hardware registers - at the cost of - * incurring the overhead of #NM fault traps. - * - * Note that on modern CPUs that support the XSAVEOPT (or other - * optimized XSAVE instructions), we don't use #NM traps anymore, - * as the hardware can track whether FPU registers need saving - * or not. On such CPUs we activate the non-lazy ('eagerfpu') - * logic, which unconditionally saves/restores all FPU state - * across context switches. (if FPU state exists.) */ union fpregs_state state; /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 430bacf73074..1b2799e0699a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,21 +21,16 @@ /* Supervisor features */ #define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) -/* Supported features which support lazy state saving */ -#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ +/* All currently supported features */ +#define XCNTXT_MASK (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PKRU) - -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h deleted file mode 100644 index c5d1785373ed..000000000000 --- a/arch/x86/include/asm/idle.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _ASM_X86_IDLE_H -#define _ASM_X86_IDLE_H - -#define IDLE_START 1 -#define IDLE_END 2 - -struct notifier_block; -void idle_notifier_register(struct notifier_block *n); -void idle_notifier_unregister(struct notifier_block *n); - -#ifdef CONFIG_X86_64 -void enter_idle(void); -void exit_idle(void); -#else /* !CONFIG_X86_64 */ -static inline void enter_idle(void) { } -static inline void exit_idle(void) { } -static inline void __exit_idle(void) { } -#endif /* CONFIG_X86_64 */ - -void amd_e400_remove_cpu(int cpu); - -#endif /* _ASM_X86_IDLE_H */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 34a46dc076d3..9814db42b790 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -57,8 +57,9 @@ #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C +#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 49da9f497b90..fe04491130ae 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -27,7 +27,6 @@ extern void intel_mid_pwr_power_off(void); extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev); extern int get_gpio_by_name(const char *name); -extern void intel_scu_device_register(struct platform_device *pdev); extern int __init sfi_parse_mrtc(struct sfi_table_header *table); extern int __init sfi_parse_mtmr(struct sfi_table_header *table); extern int sfi_mrtc_num; @@ -42,10 +41,8 @@ struct devs_id { char name[SFI_NAME_LEN + 1]; u8 type; u8 delay; + u8 msic; void *(*get_platform_data)(void *info); - /* Custom handler for devices */ - void (*device_handler)(struct sfi_device_table_entry *pentry, - struct devs_id *dev); }; #define sfi_device(i) \ diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index cd0310e186f4..4291b6a5ddf7 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -30,6 +30,7 @@ int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen, u32 dptr, u32 sptr); int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen); +int intel_pmc_s0ix_counter_read(u64 *data); #else @@ -50,6 +51,11 @@ static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, return -EINVAL; } +static inline int intel_pmc_s0ix_counter_read(u64 *data) +{ + return -EINVAL; +} + #endif /*CONFIG_INTEL_PMC_IPC*/ #endif diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h new file mode 100644 index 000000000000..0d64397cee58 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt.h @@ -0,0 +1,225 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#ifdef CONFIG_INTEL_RDT_A + +#include <linux/sched.h> +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#include <asm/intel_rdt_common.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + int closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: file name + * @mode: access mode + * @kf_ops: operations + * @seq_show: show content of the file + * @write: write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @capable: Is this feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @max_cbm: Largest Cache Bit Mask allowed + * @min_cbm_bits: Minimum number of consecutive bits to be set + * in a cache bit mask + * @domains: All domains for this resource + * @num_domains: Number of domains active + * @msr_base: Base MSR address for CBMs + * @tmp_cbms: Scratch space when updating schemata + * @num_tmp_cbms: Number of CBMs in tmp_cbms + * @cache_level: Which cache level defines scope of this domain + * @cbm_idx_multi: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + */ +struct rdt_resource { + bool enabled; + bool capable; + char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + u32 max_cbm; + struct list_head domains; + int num_domains; + int msr_base; + u32 *tmp_cbms; + int num_tmp_cbms; + int cache_level; + int cbm_idx_multi; + int cbm_idx_offset; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @cbm: array of cache bit masks (indexed by CLOSID) + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + u32 *cbm; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->capable) + +#define for_each_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=1).EDX */ +union cpuid_0x10_1_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); + +void rdt_cbm_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +/* + * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control and we enable by mounting the + * resctrl file system. + * - Caches the per cpu CLOSid values and does the MSR write only + * when a task with a different CLOSid is scheduled in. + * + * Must be called with preemption disabled. + */ +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) { + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + int closid; + + /* + * If this task has a closid assigned, use it. + * Else use the closid assigned to this cpu. + */ + closid = current->closid; + if (closid == 0) + closid = this_cpu_read(cpu_closid); + + if (closid != state->closid) { + state->closid = closid; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); + } + } +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT_A */ +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h new file mode 100644 index 000000000000..b31081b89407 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_common.h @@ -0,0 +1,27 @@ +#ifndef _ASM_X86_INTEL_RDT_COMMON_H +#define _ASM_X86_INTEL_RDT_COMMON_H + +#define MSR_IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d34bd370074b..7afb0e2f07f4 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -164,6 +164,17 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) #define virt_to_bus virt_to_phys #define bus_to_virt phys_to_virt +/* + * The default ioremap() behavior is non-cached; if you need something + * else, you probably want one of the following. + */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); +#define ioremap_uc ioremap_uc + +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory @@ -178,17 +189,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * If the area you are trying to map is a PCI BAR you should have a * look at pci_iomap(). */ -extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); -extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); -#define ioremap_uc ioremap_uc - -extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); -extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, - unsigned long prot_val); - -/* - * The default ioremap() behavior is non-cached: - */ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); @@ -207,18 +207,42 @@ extern void set_iounmap_nonlazy(void); */ #define xlate_dev_kmem_ptr(p) p +/** + * memset_io Set a range of I/O memory to a constant value + * @addr: The beginning of the I/O-memory range to set + * @val: The value to set the memory to + * @count: The number of bytes to set + * + * Set a range of I/O memory to a given value. + */ static inline void memset_io(volatile void __iomem *addr, unsigned char val, size_t count) { memset((void __force *)addr, val, count); } +/** + * memcpy_fromio Copy a block of data from I/O memory + * @dst: The (RAM) destination for the copy + * @src: The (I/O memory) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data from I/O memory. + */ static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) { memcpy(dst, (const void __force *)src, count); } +/** + * memcpy_toio Copy a block of data into I/O memory + * @dst: The (I/O memory) destination for the copy + * @src: The (RAM) source for the data + * @count: The number of bytes to copy + * + * Copy a block of data to I/O memory. + */ static inline void memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) { diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 345c99cef152..793869879464 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -extern struct dma_map_ops nommu_dma_ops; +extern const struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index d31881188431..29a594a3b82a 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -21,7 +21,6 @@ enum die_val { DIE_NMIUNKNOWN, }; -extern void printk_address(unsigned long address); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index d1d1e5094c28..200581691c6e 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -21,6 +21,12 @@ * * See arch/x86/kernel/kprobes.c for x86 kprobes history. */ + +#include <asm-generic/kprobes.h> + +#define BREAKPOINT_INSTRUCTION 0xcc + +#ifdef CONFIG_KPROBES #include <linux/types.h> #include <linux/ptrace.h> #include <linux/percpu.h> @@ -32,7 +38,6 @@ struct pt_regs; struct kprobe; typedef u8 kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0xcc #define RELATIVEJUMP_OPCODE 0xe9 #define RELATIVEJUMP_SIZE 5 #define RELATIVECALL_OPCODE 0xe8 @@ -116,4 +121,6 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); extern int kprobe_int3_handler(struct pt_regs *regs); extern int kprobe_debug_handler(struct pt_regs *regs); + +#endif /* CONFIG_KPROBES */ #endif /* _ASM_X86_KPROBES_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..3e8c287090e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bdde80731f49..74ef58c8ff53 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -55,7 +55,6 @@ #define KVM_REQ_TRIPLE_FAULT 10 #define KVM_REQ_MMU_SYNC 11 #define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_DEACTIVATE_FPU 13 #define KVM_REQ_EVENT 14 #define KVM_REQ_APF_HALT 15 #define KVM_REQ_STEAL_UPDATE 16 @@ -115,7 +114,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 +#define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 @@ -191,6 +190,8 @@ enum { #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) @@ -198,6 +199,20 @@ enum { #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) + +#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ + PFERR_USER_MASK | \ + PFERR_WRITE_MASK | \ + PFERR_PRESENT_MASK) + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting + * with the SVE bit in EPT PTEs. + */ +#define SPTE_SPECIAL_MASK (1ULL << 62) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -659,6 +674,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* GPA available (AMD only) */ + bool gpa_available; }; struct kvm_lpage_info { @@ -695,6 +713,7 @@ struct kvm_apic_map { /* Hyper-V emulation context */ struct kvm_hv { + struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; @@ -706,6 +725,12 @@ struct kvm_hv { HV_REFERENCE_TSC_PAGE tsc_ref; }; +enum kvm_irqchip_mode { + KVM_IRQCHIP_NONE, + KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ + KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -758,7 +783,7 @@ struct kvm_arch { spinlock_t pvclock_gtod_sync_lock; bool use_master_clock; u64 master_kernel_ns; - cycle_t master_cycle_now; + u64 master_cycle_now; struct delayed_work kvmclock_update_work; struct delayed_work kvmclock_sync_work; @@ -778,7 +803,7 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; + enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; @@ -805,6 +830,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { @@ -834,6 +860,7 @@ struct kvm_vcpu_stat { u64 hypercalls; u64 irq_injections; u64 nmi_injections; + u64 req_event; }; struct x86_instruction_info; @@ -908,8 +935,6 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); u32 (*get_pkru)(struct kvm_vcpu *vcpu); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -941,7 +966,7 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); - void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); + int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); @@ -1040,7 +1065,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1062,6 +1088,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +bool pdptrs_changed(struct kvm_vcpu *vcpu); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1124,7 +1151,8 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port); +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); @@ -1203,7 +1231,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); @@ -1358,7 +1386,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c2b8d24a235c..d74747b031ec 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -29,9 +29,20 @@ struct kvm_page_track_notifier_node { * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. + * @node: this node */ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); + int bytes, struct kvm_page_track_notifier_node *node); + /* + * It is called when memory slot is being moved or removed + * users can drop write-protection for the pages in that memory slot + * + * @kvm: the kvm where memory slot being moved or removed + * @slot: the memory slot being moved or removed + * @node: this node + */ + void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node); }; void kvm_page_track_init(struct kvm *kvm); @@ -58,4 +69,5 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n); void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h new file mode 100644 index 000000000000..f260bef63591 --- /dev/null +++ b/arch/x86/include/asm/kvmclock.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_CLOCK_H +#define _ASM_X86_KVM_CLOCK_H + +extern struct clocksource kvm_clock; + +#endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index ef01fef3eebc..6c119cfae218 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -9,7 +9,6 @@ #define LHCALL_FLUSH_TLB 5 #define LHCALL_LOAD_IDT_ENTRY 6 #define LHCALL_SET_STACK 7 -#define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 #define LHCALL_SET_PMD 13 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9bd7ff5ffbcc..e63873683d4a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -97,10 +97,6 @@ #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ -/* Software defined banks */ -#define MCE_EXTENDED_BANK 128 -#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) - #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" @@ -193,6 +189,15 @@ extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; + +enum mce_notifier_prios { + MCE_PRIO_SRAO = INT_MAX, + MCE_PRIO_EXTLOG = INT_MAX - 1, + MCE_PRIO_NFIT = INT_MAX - 2, + MCE_PRIO_EDAC = INT_MAX - 3, + MCE_PRIO_LOWEST = 0, +}; + extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -252,8 +257,10 @@ static inline void cmci_recheck(void) {} #ifdef CONFIG_X86_MCE_AMD void mce_amd_feature_init(struct cpuinfo_x86 *c); +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); #else static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } +static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif int mce_available(struct cpuinfo_x86 *c); @@ -293,9 +300,7 @@ void do_machine_check(struct pt_regs *, long); /* * Threshold handler */ - extern void (*mce_threshold_vector)(void); -extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* Deferred error interrupt handler */ extern void (*deferred_error_int_vector)(void); @@ -306,8 +311,6 @@ extern void (*deferred_error_int_vector)(void); void intel_init_thermal(struct cpuinfo_x86 *c); -void mce_log_therm_throt_event(__u64 status); - /* Interrupt Handler for core thermal thresholds */ extern int (*platform_thermal_notify)(__u64 msr_val); @@ -356,27 +359,32 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -struct smca_bank_name { - const char *name; /* Short name for sysfs */ - const char *long_name; /* Long name for pretty-printing */ -}; - -extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES]; - -#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype) +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) -struct smca_hwid_mcatype { +struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ + u8 count; /* Number of instances. */ }; -struct smca_bank_info { - struct smca_hwid_mcatype *type; - u32 type_instance; +struct smca_bank { + struct smca_hwid *hwid; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ }; -extern struct smca_bank_info smca_banks[MAX_NR_BANKS]; +extern struct smca_bank smca_banks[MAX_NR_BANKS]; + +extern const char *smca_get_long_name(enum smca_bank_types t); + +extern int mce_threshold_create_device(unsigned int cpu); +extern int mce_threshold_remove_device(unsigned int cpu); + +#else + +static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; +static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; #endif diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index da0d81fa0b54..daadeeea00b1 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -7,18 +7,26 @@ #define native_rdmsr(msr, val1, val2) \ do { \ - u64 __val = native_read_msr((msr)); \ + u64 __val = __rdmsr((msr)); \ (void)((val1) = (u32)__val); \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) #define native_wrmsr(msr, low, high) \ - native_write_msr(msr, low, high) + __wrmsr(msr, low, high) #define native_wrmsrl(msr, val) \ - native_write_msr((msr), \ - (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)) + __wrmsr((msr), (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)) + +struct ucode_patch { + struct list_head plist; + void *data; /* Intel uses only this one */ + u32 patch_id; + u16 equiv_cpu; +}; + +extern struct list_head microcode_cache; struct cpu_signature { unsigned int sig; @@ -55,12 +63,7 @@ struct ucode_cpu_info { void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; - -#ifdef CONFIG_MICROCODE -int __init microcode_init(void); -#else -static inline int __init microcode_init(void) { return 0; }; -#endif +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); #ifdef CONFIG_MICROCODE_INTEL extern struct microcode_ops * __init init_intel_microcode(void); @@ -131,11 +134,14 @@ static inline unsigned int x86_cpuid_family(void) } #ifdef CONFIG_MICROCODE +int __init microcode_init(void); extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); +extern bool initrd_gone; #else +static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void reload_early_microcode(void) { } diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 15eb75484cc0..3d57009e168b 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -40,40 +40,18 @@ struct microcode_amd { unsigned int mpb[0]; }; -static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, - unsigned int sig) -{ - int i = 0; - - if (!equiv_cpu_table) - return 0; - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } - return 0; -} - -extern int __apply_microcode_amd(struct microcode_amd *mc_amd); -extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); - #define PATCH_MAX_SIZE PAGE_SIZE #ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); -extern void load_ucode_amd_ap(void); -extern int __init save_microcode_in_initrd_amd(void); +extern void load_ucode_amd_ap(unsigned int family); +extern int __init save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(void); #else static inline void __init load_ucode_amd_bsp(unsigned int family) {} -static inline void load_ucode_amd_ap(void) {} -static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } +static inline void load_ucode_amd_ap(unsigned int family) {} +static inline int __init +save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } void reload_ucode_amd(void) {} #endif - -extern bool check_current_patch_level(u32 *rev, bool early); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 5e69154c9f07..e793fc9a9b20 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -52,9 +52,20 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); -extern int microcode_sanity_check(void *mc, int print_err); -extern int find_matching_signature(void *mc, unsigned int csig, int cpf); +static inline u32 intel_get_microcode_revision(void) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_IA32_UCODE_REV, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + native_cpuid_eax(1); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); + + return rev; +} #ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 72198c64e646..f9813b6d8b80 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -31,6 +31,10 @@ typedef struct { u16 pkey_allocation_map; s16 execute_only_pkey; #endif +#ifdef CONFIG_X86_INTEL_MPX + /* address of the bounds directory */ + void __user *bd_addr; +#endif } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8e0a9fe86de4..306c7e12af55 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -47,7 +47,7 @@ struct ldt_struct { * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; - int size; + unsigned int size; }; /* diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 7a35495275a9..a0d662be4c5b 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -2,6 +2,8 @@ #define _ASM_X86_MPX_H #include <linux/types.h> +#include <linux/mm_types.h> + #include <asm/ptrace.h> #include <asm/insn.h> @@ -59,7 +61,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { - return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); + return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR); } static inline void mpx_mm_init(struct mm_struct *mm) { @@ -67,7 +69,7 @@ static inline void mpx_mm_init(struct mm_struct *mm) * NULL is theoretically a valid place to put the bounds * directory, so point this at an invalid address. */ - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index aaf59b7da98a..7c9c895432a9 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,8 +3,28 @@ #include <linux/types.h> #include <linux/interrupt.h> +#include <linux/clocksource.h> #include <asm/hyperv.h> +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HVCPUID_VERSION_FEATURES). + */ +enum hv_cpuid_function { + HVCPUID_VERSION_FEATURES = 0x00000001, + HVCPUID_VENDOR_MAXFUNCTION = 0x40000000, + HVCPUID_INTERFACE = 0x40000001, + + /* + * The remaining functions depend on the value of + * HVCPUID_INTERFACE + */ + HVCPUID_VERSION = 0x40000002, + HVCPUID_FEATURES = 0x40000003, + HVCPUID_ENLIGHTENMENT_INFO = 0x40000004, + HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005, +}; + struct ms_hyperv_info { u32 features; u32 misc_features; @@ -13,6 +33,128 @@ struct ms_hyperv_info { extern struct ms_hyperv_info ms_hyperv; +/* + * Declare the MSR used to setup pages used to communicate with the hypervisor. + */ +union hv_x64_msr_hypercall_contents { + u64 as_uint64; + struct { + u64 enable:1; + u64 reserved:11; + u64 guest_physical_address:52; + }; +}; + +/* + * TSC page layout. + */ + +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; + u64 reserved2[509]; +}; + +/* + * The guest OS needs to register the guest ID with the hypervisor. + * The guest ID is a 64 bit entity and the structure of this ID is + * specified in the Hyper-V specification: + * + * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx + * + * While the current guideline does not specify how Linux guest ID(s) + * need to be generated, our plan is to publish the guidelines for + * Linux and other guest operating systems that currently are hosted + * on Hyper-V. The implementation here conforms to this yet + * unpublished guidelines. + * + * + * Bit(s) + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source + * 62:56 - Os Type; Linux is 0x100 + * 55:48 - Distro specific identification + * 47:16 - Linux kernel version number + * 15:0 - Distro specific identification + * + * + */ + +#define HV_LINUX_VENDOR_ID 0x8100 + +/* + * Generate the guest ID based on the guideline described above. + */ + +static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version, + __u64 d_info2) +{ + __u64 guest_id = 0; + + guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 48); + guest_id |= (d_info1 << 48); + guest_id |= (kernel_version << 16); + guest_id |= d_info2; + + return guest_id; +} + + +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (cmpxchg(&msg->header.message_type, old_msg_type, + HVMSG_NONE) != old_msg_type) + return; + + /* + * Make sure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + mb(); + + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + wrmsrl(HV_X64_MSR_EOM, 0); + } +} + +#define hv_get_current_tick(tick) rdmsrl(HV_X64_MSR_TIME_REF_COUNT, tick) +#define hv_init_timer(timer, tick) wrmsrl(timer, tick) +#define hv_init_timer_config(config, val) wrmsrl(config, val) + +#define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val) +#define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val) + +#define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val) +#define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val) + +#define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val) +#define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val) + +#define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index) + +#define hv_get_synint_state(int_num, val) rdmsrl(int_num, val) +#define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) + void hyperv_callback_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector @@ -25,4 +167,13 @@ void hv_setup_kexec_handler(void (*handler)(void)); void hv_remove_kexec_handler(void); void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); + +#if IS_ENABLED(CONFIG_HYPERV) +extern struct clocksource *hyperv_cs; + +void hyperv_init(void); +void hyperv_report_panic(struct pt_regs *regs); +bool hv_is_hypercall_page_setup(void); +void hyperv_cleanup(void); +#endif #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 78f3760ca1f2..d8b5f8ab8ef9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -37,12 +37,16 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ + +#define MSR_PPIN_CTL 0x0000004e +#define MSR_PPIN 0x0000004f + #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd #define MSR_PLATFORM_INFO 0x000000ce -#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 +#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) #define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) @@ -143,6 +147,7 @@ /* C-state Residency Counters */ #define MSR_PKG_C3_RESIDENCY 0x000003f8 #define MSR_PKG_C6_RESIDENCY 0x000003f9 +#define MSR_ATOM_PKG_C6_RESIDENCY 0x000003fa #define MSR_PKG_C7_RESIDENCY 0x000003fa #define MSR_CORE_C3_RESIDENCY 0x000003fc #define MSR_CORE_C6_RESIDENCY 0x000003fd @@ -199,10 +204,17 @@ #define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B #define MSR_CORE_C1_RES 0x00000660 +#define MSR_MODULE_C6_RES_MS 0x00000664 #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 +#define MSR_ATOM_CORE_RATIOS 0x0000066a +#define MSR_ATOM_CORE_VIDS 0x0000066b +#define MSR_ATOM_CORE_TURBO_RATIOS 0x0000066c +#define MSR_ATOM_CORE_TURBO_VIDS 0x0000066d + + #define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 #define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 #define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 @@ -455,6 +467,7 @@ #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 +#define MSR_MISC_FEATURE_CONTROL 0x000001a4 #define MSR_MISC_PWR_MGMT 0x000001aa #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 @@ -539,6 +552,11 @@ #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT) +/* MISC_FEATURE_ENABLES non-architectural features */ +#define MSR_MISC_FEATURE_ENABLES 0x00000140 + +#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT 1 + #define MSR_IA32_TSC_DEADLINE 0x000006E0 /* P4/Xeon+ specific */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index b5fee97813cd..898dba2e2e2c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -70,17 +70,24 @@ extern struct tracepoint __tracepoint_read_msr; extern struct tracepoint __tracepoint_write_msr; extern struct tracepoint __tracepoint_rdpmc; #define msr_tracepoint_active(t) static_key_false(&(t).key) -extern void do_trace_write_msr(unsigned msr, u64 val, int failed); -extern void do_trace_read_msr(unsigned msr, u64 val, int failed); -extern void do_trace_rdpmc(unsigned msr, u64 val, int failed); +extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); +extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); +extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); #else #define msr_tracepoint_active(t) false -static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {} -static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {} -static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {} +static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} #endif -static inline unsigned long long native_read_msr(unsigned int msr) +/* + * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR + * accessors and should not have any tracing or other functionality piggybacking + * on them - those are *purely* for accessing MSRs and nothing more. So don't even + * think of extending them - you will be slapped with a stinking trout or a frozen + * shark will reach you, wherever you are! You've been warned. + */ +static inline unsigned long long notrace __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); @@ -88,11 +95,30 @@ static inline unsigned long long native_read_msr(unsigned int msr) "2:\n" _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) : EAX_EDX_RET(val, low, high) : "c" (msr)); - if (msr_tracepoint_active(__tracepoint_read_msr)) - do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); + return EAX_EDX_VAL(val, low, high); } +static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high) +{ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +static inline unsigned long long native_read_msr(unsigned int msr) +{ + unsigned long long val; + + val = __rdmsr(msr); + + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_read_msr(msr, val, 0); + + return val; +} + static inline unsigned long long native_read_msr_safe(unsigned int msr, int *err) { @@ -115,22 +141,21 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, } /* Can be uninlined because referenced by paravirt */ -notrace static inline void native_write_msr(unsigned int msr, - unsigned low, unsigned high) +static inline void notrace +native_write_msr(unsigned int msr, u32 low, u32 high) { - asm volatile("1: wrmsr\n" - "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) - : : "c" (msr), "a"(low), "d" (high) : "memory"); + __wrmsr(msr, low, high); + if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } /* Can be uninlined because referenced by paravirt */ -notrace static inline int native_write_msr_safe(unsigned int msr, - unsigned low, unsigned high) +static inline int notrace +native_write_msr_safe(unsigned int msr, u32 low, u32 high) { int err; + asm volatile("2: wrmsr ; xor %[err],%[err]\n" "1:\n\t" ".section .fixup,\"ax\"\n\t" @@ -223,7 +248,7 @@ do { \ (void)((high) = (u32)(__val >> 32)); \ } while (0) -static inline void wrmsr(unsigned msr, unsigned low, unsigned high) +static inline void wrmsr(unsigned int msr, u32 low, u32 high) { native_write_msr(msr, low, high); } @@ -231,13 +256,13 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) -static inline void wrmsrl(unsigned msr, u64 val) +static inline void wrmsrl(unsigned int msr, u64 val) { native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); } /* wrmsr with exception handling */ -static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) +static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high) { return native_write_msr_safe(msr, low, high); } @@ -252,7 +277,7 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) __err; \ }) -static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) +static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p) { int err; @@ -325,12 +350,12 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { - rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); + rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { - wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); + wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h deleted file mode 100644 index 7d3a48275394..000000000000 --- a/arch/x86/include/asm/mutex.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef CONFIG_X86_32 -# include <asm/mutex_32.h> -#else -# include <asm/mutex_64.h> -#endif diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h deleted file mode 100644 index e9355a84fc67..000000000000 --- a/arch/x86/include/asm/mutex_32.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Assembly implementation of the mutex fastpath, based on atomic - * decrement/increment. - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - */ -#ifndef _ASM_X86_MUTEX_32_H -#define _ASM_X86_MUTEX_32_H - -#include <asm/alternative.h> - -/** - * __mutex_fastpath_lock - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * @fn: function to call if the original value was not 1 - * - * Change the count from 1 to a value lower than 1, and call <fn> if it - * wasn't 1 originally. This function MUST leave the value lower than 1 - * even when the "1" assertion wasn't true. - */ -#define __mutex_fastpath_lock(count, fail_fn) \ -do { \ - unsigned int dummy; \ - \ - typecheck(atomic_t *, count); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " decl (%%eax)\n" \ - " jns 1f \n" \ - " call " #fail_fn "\n" \ - "1:\n" \ - : "=a" (dummy) \ - : "a" (count) \ - : "memory", "ecx", "edx"); \ -} while (0) - - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(atomic_dec_return(count) < 0)) - return -1; - else - return 0; -} - -/** - * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1 - * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 0 - * - * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>. - * In the failure case, this function is allowed to either set the value - * to 1, or to set it to a value lower than 1. - * - * If the implementation sets it to a value of lower than 1, the - * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs - * to return 0 otherwise. - */ -#define __mutex_fastpath_unlock(count, fail_fn) \ -do { \ - unsigned int dummy; \ - \ - typecheck(atomic_t *, count); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " incl (%%eax)\n" \ - " jg 1f\n" \ - " call " #fail_fn "\n" \ - "1:\n" \ - : "=a" (dummy) \ - : "a" (count) \ - : "memory", "ecx", "edx"); \ -} while (0) - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to a value lower than 1, and return 0 (failure) - * if it wasn't 1 originally, or return 1 (success) otherwise. This function - * MUST leave the value lower than 1 even when the "1" assertion wasn't true. - * Additionally, if the value was < 0 originally, this function must not leave - * it to 0 on failure. - */ -static inline int __mutex_fastpath_trylock(atomic_t *count, - int (*fail_fn)(atomic_t *)) -{ - /* cmpxchg because it never induces a false contention state. */ - if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) - return 1; - - return 0; -} - -#endif /* _ASM_X86_MUTEX_32_H */ diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h deleted file mode 100644 index d9850758464e..000000000000 --- a/arch/x86/include/asm/mutex_64.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Assembly implementation of the mutex fastpath, based on atomic - * decrement/increment. - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - */ -#ifndef _ASM_X86_MUTEX_64_H -#define _ASM_X86_MUTEX_64_H - -/** - * __mutex_fastpath_lock - decrement and call function if negative - * @v: pointer of type atomic_t - * @fail_fn: function to call if the result is negative - * - * Atomically decrements @v and calls <fail_fn> if the result is negative. - */ -#ifdef CC_HAVE_ASM_GOTO -static inline void __mutex_fastpath_lock(atomic_t *v, - void (*fail_fn)(atomic_t *)) -{ - asm_volatile_goto(LOCK_PREFIX " decl %0\n" - " jns %l[exit]\n" - : : "m" (v->counter) - : "memory", "cc" - : exit); - fail_fn(v); -exit: - return; -} -#else -#define __mutex_fastpath_lock(v, fail_fn) \ -do { \ - unsigned long dummy; \ - \ - typecheck(atomic_t *, v); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \ - " jns 1f \n" \ - " call " #fail_fn "\n" \ - "1:" \ - : "=D" (dummy) \ - : "D" (v) \ - : "rax", "rsi", "rdx", "rcx", \ - "r8", "r9", "r10", "r11", "memory"); \ -} while (0) -#endif - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(atomic_dec_return(count) < 0)) - return -1; - else - return 0; -} - -/** - * __mutex_fastpath_unlock - increment and call function if nonpositive - * @v: pointer of type atomic_t - * @fail_fn: function to call if the result is nonpositive - * - * Atomically increments @v and calls <fail_fn> if the result is nonpositive. - */ -#ifdef CC_HAVE_ASM_GOTO -static inline void __mutex_fastpath_unlock(atomic_t *v, - void (*fail_fn)(atomic_t *)) -{ - asm_volatile_goto(LOCK_PREFIX " incl %0\n" - " jg %l[exit]\n" - : : "m" (v->counter) - : "memory", "cc" - : exit); - fail_fn(v); -exit: - return; -} -#else -#define __mutex_fastpath_unlock(v, fail_fn) \ -do { \ - unsigned long dummy; \ - \ - typecheck(atomic_t *, v); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \ - " jg 1f\n" \ - " call " #fail_fn "\n" \ - "1:" \ - : "=D" (dummy) \ - : "D" (v) \ - : "rax", "rsi", "rdx", "rcx", \ - "r8", "r9", "r10", "r11", "memory"); \ -} while (0) -#endif - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to 0 and return 1 (success), or return 0 (failure) - * if it wasn't 1 originally. [the fallback function is never used on - * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.] - */ -static inline int __mutex_fastpath_trylock(atomic_t *count, - int (*fail_fn)(atomic_t *)) -{ - if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) - return 1; - - return 0; -} - -#endif /* _ASM_X86_MUTEX_64_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index f37f2d8a2989..bda3c27f0da0 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -2,6 +2,7 @@ #define _ASM_X86_MWAIT_H #include <linux/sched.h> +#include <linux/sched/idle.h> #include <asm/cpufeature.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ce932812f142..0489884fdc44 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -41,11 +41,6 @@ static inline void set_debugreg(unsigned long val, int reg) PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val); } -static inline void clts(void) -{ - PVOP_VCALL0(pv_cpu_ops.clts); -} - static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0); @@ -480,6 +475,17 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, native_pmd_val(pmd)); } +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + if (sizeof(pudval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); + else + PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, + native_pud_val(pud)); +} + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); @@ -678,6 +684,11 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } +static __always_inline bool pv_vcpu_is_preempted(long cpu) +{ + return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu); +} + #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0f400c0e4979..b060f962d581 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -103,8 +103,6 @@ struct pv_cpu_ops { unsigned long (*get_debugreg)(int regno); void (*set_debugreg)(int regno, unsigned long value); - void (*clts)(void); - unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); @@ -251,6 +249,8 @@ struct pv_mmu_ops { void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmdval); + void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pudval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -310,6 +310,8 @@ struct pv_lock_ops { void (*wait)(u8 *ptr, u8 val); void (*kick)(int cpu); + + struct paravirt_callee_save vcpu_is_preempted; }; /* This contains all the paravirt structures: we get a convenient @@ -508,6 +510,18 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)op) #endif +#define PVOP_RETMASK(rettype) \ + ({ unsigned long __mask = ~0UL; \ + switch (sizeof(rettype)) { \ + case 1: __mask = 0xffUL; break; \ + case 2: __mask = 0xffffUL; break; \ + case 4: __mask = 0xffffffffUL; break; \ + default: break; \ + } \ + __mask; \ + }) + + #define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ pre, post, ...) \ ({ \ @@ -535,7 +549,7 @@ int paravirt_disable_iospace(void); paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ + __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \ } \ __ret; \ }) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 84f58de08c2b..9fa03604b2b3 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -507,17 +507,6 @@ do { \ #endif -/* This is not atomic against other CPUs -- CPU preemption needs to be off */ -#define x86_test_and_clear_bit_percpu(bit, var) \ -({ \ - bool old__; \ - asm volatile("btr %2,"__percpu_arg(1)"\n\t" \ - CC_SET(c) \ - : CC_OUT(c) (old__), "+m" (var) \ - : "dIr" (bit)); \ - old__; \ -}) - static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, const unsigned long __percpu *addr) { diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index fd74a11959de..a8b96e708c2b 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -21,6 +21,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) *pmdp = pmd; } +static inline void native_set_pud(pud_t *pudp, pud_t pud) +{ +} + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte(ptep, pte); @@ -31,6 +35,10 @@ static inline void native_pmd_clear(pmd_t *pmdp) native_set_pmd(pmdp, __pmd(0)); } +static inline void native_pud_clear(pud_t *pudp) +{ +} + static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) { @@ -55,6 +63,15 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ + return __pud(xchg((pudval_t *)xp, 0)); +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Bit manipulation helper on pte/pgoff entry */ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift, unsigned long mask, unsigned int leftshift) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index cdaa58c9b39e..72277b1028a5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,6 +121,13 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } +#if !defined(CONFIG_SMP) || (defined(CONFIG_HIGHMEM64G) && \ + defined(CONFIG_PARAVIRT)) +static inline void native_pud_clear(pud_t *pudp) +{ +} +#endif + static inline void pud_clear(pud_t *pudp) { set_pud(pudp, __pud(0)); @@ -176,6 +183,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pud { + struct { + u32 pud_low; + u32 pud_high; + }; + pud_t pud; +}; + +static inline pud_t native_pudp_get_and_clear(pud_t *pudp) +{ + union split_pud res, *orig = (union split_pud *)pudp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pud_low = xchg(&orig->pud_low, 0); + res.pud_high = orig->pud_high; + orig->pud_high = 0; + + return res.pud; +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 437feb436efa..1cfb36b8c024 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -46,6 +46,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) #define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) +#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -128,6 +129,16 @@ static inline int pmd_young(pmd_t pmd) return pmd_flags(pmd) & _PAGE_ACCESSED; } +static inline int pud_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_DIRTY; +} + +static inline int pud_young(pud_t pud) +{ + return pud_flags(pud) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -181,6 +192,13 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; +} +#endif + #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { @@ -192,6 +210,18 @@ static inline int pmd_devmap(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_DEVMAP); } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_devmap(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_DEVMAP); +} +#else +static inline int pud_devmap(pud_t pud) +{ + return 0; +} +#endif #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -333,6 +363,65 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v & ~clear); +} + +static inline pud_t pud_mkold(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_DIRTY); +} + +static inline pud_t pud_wrprotect(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); +} + +static inline pud_t pud_mkdevmap(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DEVMAP); +} + +static inline pud_t pud_mkhuge(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_PSE); +} + +static inline pud_t pud_mkyoung(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE); +} + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -344,6 +433,11 @@ static inline int pmd_soft_dirty(pmd_t pmd) return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; } +static inline int pud_soft_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SOFT_DIRTY; +} + static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SOFT_DIRTY); @@ -354,6 +448,11 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_mksoft_dirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SOFT_DIRTY); +} + static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -364,6 +463,11 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_clear_soft_dirty(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -392,6 +496,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) massage_pgprot(pgprot)); } +static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) +{ + return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte); @@ -771,6 +881,14 @@ static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) return res; } +static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) +{ + pud_t res = *pudp; + + native_pud_clear(pudp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -783,6 +901,12 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, native_set_pmd(pmdp, pmd); } +static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + native_set_pud(pudp, pud); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which @@ -861,10 +985,15 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH extern int pmdp_clear_flush_young(struct vm_area_struct *vma, @@ -884,6 +1013,13 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long return native_pmdp_get_and_clear(pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + return native_pudp_get_and_clear(pudp); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -932,6 +1068,10 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index b6c0b404898a..fbc73360aea0 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -27,6 +27,7 @@ struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; extern pgd_t initial_page_table[1024]; +extern pmd_t initial_pg_pmd[]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } @@ -75,4 +76,35 @@ do { \ #define kern_addr_valid(kaddr) (0) #endif +/* + * This is how much memory in addition to the memory covered up to + * and including _end we need mapped initially. + * We need: + * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) + * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. + * + * KERNEL_IMAGE_SIZE should be greater than pa(_end) + * and small than max_low_pfn, otherwise will waste some page table entries + */ +#if PTRS_PER_PMD > 1 +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) +#else +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) +#endif + +/* + * Number of possible pages in the lowmem region. + * + * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a + * gas warning about overflowing shift count when gas has been compiled + * with only a host target support using a 32-bit type for internal + * representation. + */ +#define LOWMEM_PAGES ((((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)) + #endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1cc82ece9ac1..73c7ccc38912 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,6 +106,21 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ +#ifdef CONFIG_SMP + return native_make_pud(xchg(&xp->pud, 0)); +#else + /* native_local_pudp_get_and_clear, + * but duplicated because of cyclic dependency + */ + pud_t ret = *xp; + + native_pud_clear(xp); + return ret; +#endif +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { *pgdp = pgd; @@ -116,8 +131,7 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } -extern void sync_global_pgds(unsigned long start, unsigned long end, - int removed); +extern void sync_global_pgds(unsigned long start, unsigned long end); /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h deleted file mode 100644 index aa8744c77c6d..000000000000 --- a/arch/x86/include/asm/pmc_atom.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Intel Atom SOC Power Management Controller Header File - * Copyright (c) 2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - */ - -#ifndef PMC_ATOM_H -#define PMC_ATOM_H - -/* ValleyView Power Control Unit PCI Device ID */ -#define PCI_DEVICE_ID_VLV_PMC 0x0F1C -/* CherryTrail Power Control Unit PCI Device ID */ -#define PCI_DEVICE_ID_CHT_PMC 0x229C - -/* PMC Memory mapped IO registers */ -#define PMC_BASE_ADDR_OFFSET 0x44 -#define PMC_BASE_ADDR_MASK 0xFFFFFE00 -#define PMC_MMIO_REG_LEN 0x100 -#define PMC_REG_BIT_WIDTH 32 - -/* BIOS uses FUNC_DIS to disable specific function */ -#define PMC_FUNC_DIS 0x34 -#define PMC_FUNC_DIS_2 0x38 - -/* CHT specific bits in FUNC_DIS2 register */ -#define BIT_FD_GMM BIT(3) -#define BIT_FD_ISH BIT(4) - -/* S0ix wake event control */ -#define PMC_S0IX_WAKE_EN 0x3C - -#define BIT_LPC_CLOCK_RUN BIT(4) -#define BIT_SHARED_IRQ_GPSC BIT(5) -#define BIT_ORED_DEDICATED_IRQ_GPSS BIT(18) -#define BIT_ORED_DEDICATED_IRQ_GPSC BIT(19) -#define BIT_SHARED_IRQ_GPSS BIT(20) - -#define PMC_WAKE_EN_SETTING ~(BIT_LPC_CLOCK_RUN | \ - BIT_SHARED_IRQ_GPSC | \ - BIT_ORED_DEDICATED_IRQ_GPSS | \ - BIT_ORED_DEDICATED_IRQ_GPSC | \ - BIT_SHARED_IRQ_GPSS) - -/* The timers acumulate time spent in sleep state */ -#define PMC_S0IR_TMR 0x80 -#define PMC_S0I1_TMR 0x84 -#define PMC_S0I2_TMR 0x88 -#define PMC_S0I3_TMR 0x8C -#define PMC_S0_TMR 0x90 -/* Sleep state counter is in units of of 32us */ -#define PMC_TMR_SHIFT 5 - -/* Power status of power islands */ -#define PMC_PSS 0x98 - -#define PMC_PSS_BIT_GBE BIT(0) -#define PMC_PSS_BIT_SATA BIT(1) -#define PMC_PSS_BIT_HDA BIT(2) -#define PMC_PSS_BIT_SEC BIT(3) -#define PMC_PSS_BIT_PCIE BIT(4) -#define PMC_PSS_BIT_LPSS BIT(5) -#define PMC_PSS_BIT_LPE BIT(6) -#define PMC_PSS_BIT_DFX BIT(7) -#define PMC_PSS_BIT_USH_CTRL BIT(8) -#define PMC_PSS_BIT_USH_SUS BIT(9) -#define PMC_PSS_BIT_USH_VCCS BIT(10) -#define PMC_PSS_BIT_USH_VCCA BIT(11) -#define PMC_PSS_BIT_OTG_CTRL BIT(12) -#define PMC_PSS_BIT_OTG_VCCS BIT(13) -#define PMC_PSS_BIT_OTG_VCCA_CLK BIT(14) -#define PMC_PSS_BIT_OTG_VCCA BIT(15) -#define PMC_PSS_BIT_USB BIT(16) -#define PMC_PSS_BIT_USB_SUS BIT(17) - -/* CHT specific bits in PSS register */ -#define PMC_PSS_BIT_CHT_UFS BIT(7) -#define PMC_PSS_BIT_CHT_UXD BIT(11) -#define PMC_PSS_BIT_CHT_UXD_FD BIT(12) -#define PMC_PSS_BIT_CHT_UX_ENG BIT(15) -#define PMC_PSS_BIT_CHT_USB_SUS BIT(16) -#define PMC_PSS_BIT_CHT_GMM BIT(17) -#define PMC_PSS_BIT_CHT_ISH BIT(18) -#define PMC_PSS_BIT_CHT_DFX_MASTER BIT(26) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER1 BIT(27) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER2 BIT(28) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER3 BIT(29) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER4 BIT(30) -#define PMC_PSS_BIT_CHT_DFX_CLUSTER5 BIT(31) - -/* These registers reflect D3 status of functions */ -#define PMC_D3_STS_0 0xA0 - -#define BIT_LPSS1_F0_DMA BIT(0) -#define BIT_LPSS1_F1_PWM1 BIT(1) -#define BIT_LPSS1_F2_PWM2 BIT(2) -#define BIT_LPSS1_F3_HSUART1 BIT(3) -#define BIT_LPSS1_F4_HSUART2 BIT(4) -#define BIT_LPSS1_F5_SPI BIT(5) -#define BIT_LPSS1_F6_XXX BIT(6) -#define BIT_LPSS1_F7_XXX BIT(7) -#define BIT_SCC_EMMC BIT(8) -#define BIT_SCC_SDIO BIT(9) -#define BIT_SCC_SDCARD BIT(10) -#define BIT_SCC_MIPI BIT(11) -#define BIT_HDA BIT(12) -#define BIT_LPE BIT(13) -#define BIT_OTG BIT(14) -#define BIT_USH BIT(15) -#define BIT_GBE BIT(16) -#define BIT_SATA BIT(17) -#define BIT_USB_EHCI BIT(18) -#define BIT_SEC BIT(19) -#define BIT_PCIE_PORT0 BIT(20) -#define BIT_PCIE_PORT1 BIT(21) -#define BIT_PCIE_PORT2 BIT(22) -#define BIT_PCIE_PORT3 BIT(23) -#define BIT_LPSS2_F0_DMA BIT(24) -#define BIT_LPSS2_F1_I2C1 BIT(25) -#define BIT_LPSS2_F2_I2C2 BIT(26) -#define BIT_LPSS2_F3_I2C3 BIT(27) -#define BIT_LPSS2_F4_I2C4 BIT(28) -#define BIT_LPSS2_F5_I2C5 BIT(29) -#define BIT_LPSS2_F6_I2C6 BIT(30) -#define BIT_LPSS2_F7_I2C7 BIT(31) - -#define PMC_D3_STS_1 0xA4 -#define BIT_SMB BIT(0) -#define BIT_OTG_SS_PHY BIT(1) -#define BIT_USH_SS_PHY BIT(2) -#define BIT_DFX BIT(3) - -/* CHT specific bits in PMC_D3_STS_1 register */ -#define BIT_STS_GMM BIT(1) -#define BIT_STS_ISH BIT(2) - -/* PMC I/O Registers */ -#define ACPI_BASE_ADDR_OFFSET 0x40 -#define ACPI_BASE_ADDR_MASK 0xFFFFFE00 -#define ACPI_MMIO_REG_LEN 0x100 - -#define PM1_CNT 0x4 -#define SLEEP_TYPE_MASK 0xFFFFECFF -#define SLEEP_TYPE_S5 0x1C00 -#define SLEEP_ENABLE 0x2000 - -extern int pmc_atom_read(int offset, u32 *value); -extern int pmc_atom_write(int offset, u32 value); - -#endif /* PMC_ATOM_H */ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 17f218645701..ec1f3c651150 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -24,7 +24,13 @@ static __always_inline int preempt_count(void) static __always_inline void preempt_count_set(int pc) { - raw_cpu_write_4(__preempt_count, pc); + int old, new; + + do { + old = raw_cpu_read_4(__preempt_count); + new = (old & PREEMPT_NEED_RESCHED) | + (pc & ~PREEMPT_NEED_RESCHED); + } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); } /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 984a7bf17f6a..f385eca5407a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -104,6 +104,7 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; + __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -137,6 +138,17 @@ struct cpuinfo_x86 { u32 microcode; }; +struct cpuid_regs { + u32 eax, ebx, ecx, edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 @@ -178,6 +190,9 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern u32 get_scattered_cpuid_leaf(unsigned int level, + unsigned int sub_leaf, + enum cpuid_regs_idx reg); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); @@ -205,6 +220,24 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, : "memory"); } +#define native_cpuid_reg(reg) \ +static inline unsigned int native_cpuid_##reg(unsigned int op) \ +{ \ + unsigned int eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum. + */ +native_cpuid_reg(eax) +native_cpuid_reg(ebx) +native_cpuid_reg(ecx) +native_cpuid_reg(edx) + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__pa(pgdir)); @@ -271,7 +304,7 @@ struct x86_hw_tss { u16 reserved5; u16 io_bitmap_base; -} __attribute__((packed)) ____cacheline_aligned; +} __attribute__((packed)); #endif /* @@ -309,6 +342,16 @@ struct tss_struct { DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +/* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ +#define __KERNEL_TSS_LIMIT \ + (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #endif @@ -588,43 +631,76 @@ static __always_inline void cpu_relax(void) rep_nop(); } -#define cpu_relax_lowlatency() cpu_relax() - -/* Stop speculative execution and prefetching of modified code. */ +/* + * This function forces the icache and prefetched instruction stream to + * catch up with reality in two very specific cases: + * + * a) Text was modified using one virtual address and is about to be executed + * from the same physical page at a different virtual address. + * + * b) Text was modified on a different CPU, may subsequently be + * executed on this CPU, and you want to make sure the new version + * gets executed. This generally means you're calling this in a IPI. + * + * If you're calling this for a different reason, you're probably doing + * it wrong. + */ static inline void sync_core(void) { - int tmp; - -#ifdef CONFIG_M486 /* - * Do a CPUID if available, otherwise do a jump. The jump - * can conveniently enough be the jump around CPUID. + * There are quite a few ways to do this. IRET-to-self is nice + * because it works on every CPU, at any CPL (so it's compatible + * with paravirtualization), and it never exits to a hypervisor. + * The only down sides are that it's a bit slow (it seems to be + * a bit more than 2x slower than the fastest options) and that + * it unmasks NMIs. The "push %cs" is needed because, in + * paravirtual environments, __KERNEL_CS may not be a valid CS + * value when we do IRET directly. + * + * In case NMI unmasking or performance ever becomes a problem, + * the next best option appears to be MOV-to-CR2 and an + * unconditional jump. That sequence also works on all CPUs, + * but it will fault at CPL3 (i.e. Xen PV and lguest). + * + * CPUID is the conventional way, but it's nasty: it doesn't + * exist on some 486-like CPUs, and it usually exits to a + * hypervisor. + * + * Like all of Linux's memory ordering operations, this is a + * compiler barrier as well. */ - asm volatile("cmpl %2,%1\n\t" - "jl 1f\n\t" - "cpuid\n" - "1:" - : "=a" (tmp) - : "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1) - : "ebx", "ecx", "edx", "memory"); + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_32 + asm volatile ( + "pushfl\n\t" + "pushl %%cs\n\t" + "pushl $1f\n\t" + "iret\n\t" + "1:" + : "+r" (__sp) : : "memory"); #else - /* - * CPUID is a barrier to speculative execution. - * Prefetched instructions are automatically - * invalidated when modified. - */ - asm volatile("cpuid" - : "=a" (tmp) - : "0" (1) - : "ebx", "ecx", "edx", "memory"); + unsigned int tmp; + + asm volatile ( + "mov %%ss, %0\n\t" + "pushq %q0\n\t" + "pushq %%rsp\n\t" + "addq $8, (%%rsp)\n\t" + "pushfq\n\t" + "mov %%cs, %0\n\t" + "pushq %q0\n\t" + "pushq $1f\n\t" + "iretq\n\t" + "1:" + : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif } extern void select_idle_routine(const struct cpuinfo_x86 *c); -extern void init_amd_e400_c1e_mask(void); +extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; -extern bool amd_e400_c1e_detected; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 3ad741b84072..448cfe1b48cf 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -14,7 +14,7 @@ static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) #endif /* some helper functions for xen and kvm pv clock sources */ -cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); @@ -87,11 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) } static __always_inline -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, - u64 tsc) +u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc) { u64 delta = tsc - src->tsc_timestamp; - cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, + u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); return src->system_time + offset; } diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index eaba08076030..48a706f641f2 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -32,6 +32,12 @@ static inline void queued_spin_unlock(struct qspinlock *lock) { pv_queued_spin_unlock(lock); } + +#define vcpu_is_preempted vcpu_is_preempted +static inline bool vcpu_is_preempted(long cpu) +{ + return pv_vcpu_is_preempted(cpu); +} #else static inline void queued_spin_unlock(struct qspinlock *lock) { diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 19a2224f9e16..12af3e35edfa 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -6,11 +6,6 @@ #include <asm/nops.h> -static inline void native_clts(void) -{ - asm volatile("clts"); -} - /* * Volatile isn't enough to prevent the compiler from reordering the * read/write functions for the control registers and messing everything up. @@ -208,16 +203,8 @@ static inline void load_gs_index(unsigned selector) #endif -/* Clear the 'TS' bit */ -static inline void clts(void) -{ - native_clts(); -} - #endif/* CONFIG_PARAVIRT */ -#define stts() write_cr0(read_cr0() | X86_CR0_TS) - static inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 921bea7a2708..6d391909e864 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -23,9 +23,6 @@ /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 << 15) -extern struct static_key paravirt_ticketlocks_enabled; -static __always_inline bool static_key_false(struct static_key *key); - #include <asm/qspinlock.h> /* diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 37f2e0b377ad..2e41c50ddf47 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -30,8 +30,7 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); -void stack_type_str(enum stack_type type, const char **begin, - const char **end); +const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { @@ -43,8 +42,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) addr + len > begin && addr + len <= end); } -extern int kstack_depth_to_print; - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else @@ -61,7 +58,7 @@ get_frame_pointer(struct task_struct *task, struct pt_regs *regs) if (task == current) return __builtin_frame_address(0); - return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; + return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * @@ -86,9 +83,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl); -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); - extern unsigned int code_bytes; /* The form of the top of the frame on the stack */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 5cb436acd463..fcc5cd387fd1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -36,7 +36,10 @@ static inline void prepare_switch_to(struct task_struct *prev, asmlinkage void ret_from_fork(void); -/* data that is pointed to by thread.sp */ +/* + * This is the structure pointed to by thread.sp for an inactive task. The + * order of the fields must match the code in __switch_to_asm(). + */ struct inactive_task_frame { #ifdef CONFIG_X86_64 unsigned long r15; @@ -48,6 +51,11 @@ struct inactive_task_frame { unsigned long di; #endif unsigned long bx; + + /* + * These two fields must be together. They form a stack frame header, + * needed by get_frame_pointer(). + */ unsigned long bp; unsigned long ret_addr; }; diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index cf75871d2f81..6358a85e2270 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -146,4 +146,36 @@ struct pci_bus; int x86_pci_root_bus_node(int bus); void x86_pci_root_bus_resources(int bus, struct list_head *resources); +extern bool x86_topology_update; + +#ifdef CONFIG_SCHED_MC_PRIO +#include <asm/percpu.h> + +DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); +extern unsigned int __read_mostly sysctl_sched_itmt_enabled; + +/* Interface to set priority of a cpu */ +void sched_set_itmt_core_prio(int prio, int core_cpu); + +/* Interface to notify scheduler that system supports ITMT */ +int sched_set_itmt_support(void); + +/* Interface to notify scheduler that system revokes ITMT support */ +void sched_clear_itmt_support(void); + +#else /* CONFIG_SCHED_MC_PRIO */ + +#define sysctl_sched_itmt_enabled 0 +static inline void sched_set_itmt_core_prio(int prio, int core_cpu) +{ +} +static inline int sched_set_itmt_support(void) +{ + return 0; +} +static inline void sched_clear_itmt_support(void) +{ +} +#endif /* CONFIG_SCHED_MC_PRIO */ + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2fbc66c7885b..2422b14c50a7 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_exceptions, diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 9217ab1f5bf6..342e59789fcd 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -14,7 +14,6 @@ DECLARE_EVENT_CLASS(x86_fpu, __field(struct fpu *, fpu) __field(bool, fpregs_active) __field(bool, fpstate_active) - __field(int, counter) __field(u64, xfeatures) __field(u64, xcomp_bv) ), @@ -23,17 +22,15 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; - __entry->counter = fpu->counter; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->fpregs_active, __entry->fpstate_active, - __entry->counter, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 38a09a13a9bc..32dd6a9e343c 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_irq_vector, diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 33b6365c22fe..f5e6f1c417df 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -29,7 +29,7 @@ static inline cycles_t get_cycles(void) return rdtsc(); } -extern struct system_counterval_t convert_art_to_tsc(cycle_t art); +extern struct system_counterval_t convert_art_to_tsc(u64 art); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); @@ -45,8 +45,17 @@ extern int tsc_clocksource_reliable; * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ +#ifdef CONFIG_X86_TSC +extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); +extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); +#else +static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } +static inline void tsc_verify_tsc_adjust(bool resume) { } +static inline void check_tsc_sync_source(int cpu) { } +static inline void check_tsc_sync_target(void) { } +#endif extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index faf3687f1035..ea148313570f 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +#else +# define WARN_ON_IN_IRQ() +#endif + /** * access_ok: - Checks if a user space pointer is valid * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that @@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) \ - likely(!__range_not_ok(addr, size, user_addr_max())) +#define access_ok(type, addr, size) \ +({ \ + WARN_ON_IN_IRQ(); \ + likely(!__range_not_ok(addr, size, user_addr_max())); \ +}) /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 46de9ac4b990..6fa75b17aec3 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,7 +12,8 @@ struct unwind_state { struct task_struct *task; int graph_idx; #ifdef CONFIG_FRAME_POINTER - unsigned long *bp; + unsigned long *bp, *orig_sp; + struct pt_regs *regs; #else unsigned long *sp; #endif @@ -47,7 +48,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) if (unwind_done(state)) return NULL; - return state->bp + 1; + return state->regs ? &state->regs->ip : state->bp + 1; +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs; } #else /* !CONFIG_FRAME_POINTER */ @@ -58,6 +67,11 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return NULL; +} + #endif /* CONFIG_FRAME_POINTER */ #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 062921ef34e9..6686820feae9 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -10,6 +10,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); +extern int is_uv_hubless(void); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -23,6 +24,7 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline int is_uv_system(void) { return 0; } +static inline int is_uv_hubless(void) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 097b80c989c4..72e8300b1e8a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -772,6 +772,7 @@ static inline int uv_num_possible_blades(void) /* Per Hub NMI support */ extern void uv_nmi_setup(void); +extern void uv_nmi_setup_hubless(void); /* BMC sets a bit this MMR non-zero before sending an NMI */ #define UVH_NMI_MMR UVH_SCRATCH5 @@ -799,6 +800,8 @@ struct uv_hub_nmi_s { atomic_t read_mmr_count; /* count of MMR reads */ atomic_t nmi_count; /* count of true UV NMIs */ unsigned long nmi_value; /* last value read from NMI MMR */ + bool hub_present; /* false means UV hubless system */ + bool pch_owner; /* indicates this hub owns PCH */ }; struct uv_cpu_nmi_s { diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index e728699db774..022e59714562 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -17,8 +17,8 @@ struct vsyscall_gtod_data { unsigned seq; int vclock_mode; - cycle_t cycle_last; - cycle_t mask; + u64 cycle_last; + u64 mask; u32 mult; u32 shift; @@ -89,8 +89,13 @@ static inline unsigned int __getcpu(void) * works on all CPUs. This is volatile so that it orders * correctly wrt barrier() and to keep gcc from cleverly * hoisting it out of the calling function. + * + * If RDPID is available, use it. */ - asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + alternative_io ("lsl %[p],%[seg]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); return p; } diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a002b07a7099..cc54b7026567 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,7 @@ #define VMX_H +#include <linux/bitops.h> #include <linux/types.h> #include <uapi/asm/vmx.h> @@ -60,6 +61,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_DESC 0x00000004 #define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 @@ -110,6 +112,36 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) +{ + return vmx_basic & GENMASK_ULL(30, 0); +} + +static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; +} + +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) +{ + return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; +} + +static inline int vmx_misc_cr3_count(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(24, 16)) >> 16; +} + +static inline int vmx_misc_max_msr(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(27, 25)) >> 25; +} + +static inline int vmx_misc_mseg_revid(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(63, 32)) >> 32; +} + /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -399,10 +431,11 @@ enum vmcs_field { #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 -#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 #define VMX_EPT_EXTENT_SHIFT 24 @@ -419,8 +452,10 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 @@ -432,8 +467,16 @@ enum vmcs_field { #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) -#define VMX_EPT_ACCESS_BIT (1ull << 8) -#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) + +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul @@ -465,6 +508,22 @@ struct vmx_msr_entry { #define ENTRY_FAIL_VMCS_LINK_PTR 4 /* + * Exit Qualifications for EPT Violations + */ +#define EPT_VIOLATION_ACC_READ_BIT 0 +#define EPT_VIOLATION_ACC_WRITE_BIT 1 +#define EPT_VIOLATION_ACC_INSTR_BIT 2 +#define EPT_VIOLATION_READABLE_BIT 3 +#define EPT_VIOLATION_WRITABLE_BIT 4 +#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) +#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) +#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) +#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) +#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) +#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 6ba793178441..7ba7e90a9ad6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -59,7 +59,7 @@ struct x86_init_irqs { /** * struct x86_init_oem - oem platform specific customizing functions - * @arch_setup: platform specific architecure setup + * @arch_setup: platform specific architecture setup * @banner: print a platform specific banner */ struct x86_init_oem { @@ -165,8 +165,25 @@ struct x86_legacy_devices { }; /** + * enum x86_legacy_i8042_state - i8042 keyboard controller state + * @X86_LEGACY_I8042_PLATFORM_ABSENT: the controller is always absent on + * given platform/subarch. + * @X86_LEGACY_I8042_FIRMWARE_ABSENT: firmware reports that the controller + * is absent. + * @X86_LEGACY_i8042_EXPECTED_PRESENT: the controller is likely to be + * present, the i8042 driver should probe for controller existence. + */ +enum x86_legacy_i8042_state { + X86_LEGACY_I8042_PLATFORM_ABSENT, + X86_LEGACY_I8042_FIRMWARE_ABSENT, + X86_LEGACY_I8042_EXPECTED_PRESENT, +}; + +/** * struct x86_legacy_features - legacy x86 features * + * @i8042: indicated if we expect the device to have i8042 controller + * present. * @rtc: this device has a CMOS real-time clock present * @reserve_bios_regions: boot code will search for the EBDA address and the * start of the 640k - 1M BIOS region. If false, the platform must @@ -175,6 +192,7 @@ struct x86_legacy_devices { * documentation for further details. */ struct x86_legacy_features { + enum x86_legacy_i8042_state i8042; int rtc; int reserve_bios_regions; struct x86_legacy_devices devices; @@ -188,15 +206,14 @@ struct x86_legacy_features { * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus - * @i8042_detect pre-detect if i8042 controller exists * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume - * @apic_post_init: adjust apic if neeeded + * @apic_post_init: adjust apic if needed * @legacy: legacy features * @set_legacy_features: override legacy features. Use of this callback * is highly discouraged. You should only need * this if your hardware platform requires further - * custom fine tuning far beyong what may be + * custom fine tuning far beyond what may be * possible in x86_early_init_platform_quirks() by * only using the current x86_hardware_subarch * semantics. @@ -210,7 +227,6 @@ struct x86_platform_ops { bool (*is_untracked_pat_range)(u64 start, u64 end); void (*nmi_init)(void); unsigned char (*get_nmi_reason)(void); - int (*i8042_detect)(void); void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a12a047184ee..f6d20f6cca12 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -472,6 +472,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg) return _hypercall2(int, xenpmu_op, op, arg); } +static inline int +HYPERVISOR_dm_op( + domid_t dom, unsigned int nr_bufs, void *bufs) +{ + return _hypercall3(int, dm_op, dom, nr_bufs, bufs); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f5fb840b43e8..33cbd3db97b9 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -7,7 +7,7 @@ #include <linux/pfn.h> #include <linux/mm.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/page.h> #include <asm/pgtable.h> diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index c18ce67495fa..5138dacf8bb8 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,6 +7,7 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 +#define SETUP_APPLE_PROPERTIES 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -134,7 +135,8 @@ struct boot_params { __u8 eddbuf_entries; /* 0x1e9 */ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ __u8 kbd_status; /* 0x1eb */ - __u8 _pad5[3]; /* 0x1ec */ + __u8 secure_boot; /* 0x1ec */ + __u8 _pad5[2]; /* 0x1ed */ /* * The sentinel is set to a nonzero value (0xff) in header.S. * diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h new file mode 100644 index 000000000000..0bd2be5c7617 --- /dev/null +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_HWCAP2_H +#define _ASM_X86_HWCAP2_H + +/* MONITOR/MWAIT enabled in Ring 3 */ +#define HWCAP2_RING3MWAIT (1 << 0) + +#endif diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 9b1a91834ac8..3a20ccf787b8 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -73,6 +73,9 @@ */ #define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) +/* Crash MSR available */ +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) + /* * Feature identification: EBX indicates which flags were specified at * partition creation. The format is the same as the partition creation @@ -144,6 +147,11 @@ */ #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) +/* + * Crash notification flag. + */ +#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) + /* MSR used to identify the guest OS. */ #define HV_X64_MSR_GUEST_OS_ID 0x40000000 diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 94dc8ca434e0..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -45,7 +45,18 @@ struct kvm_steal_time { __u64 steal; __u32 version; __u32 flags; - __u32 pad[12]; + __u8 preempted; + __u8 u8_pad[3]; + __u32 pad[11]; +}; + +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; }; #define KVM_STEAL_ALIGNMENT_BITS 5 diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 69a6e07e3149..eb6247a7009b 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -28,6 +28,7 @@ struct mce { __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + __u64 ppin; /* Protected Processor Inventory Number */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index ae135de547f5..835aa51c7f6e 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -6,10 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 -#ifdef CONFIG_CHECKPOINT_RESTORE -# define ARCH_MAP_VDSO_X32 0x2001 -# define ARCH_MAP_VDSO_32 0x2002 -# define ARCH_MAP_VDSO_64 0x2003 -#endif +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..14458658e988 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,8 @@ #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 @@ -113,6 +115,8 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ @@ -129,6 +133,7 @@ { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 79076d75bdbf..84c00592d359 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -75,7 +75,7 @@ apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smpboot.o -obj-$(CONFIG_SMP) += tsc_sync.o +obj-$(CONFIG_X86_TSC) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ @@ -100,8 +100,6 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o -obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o -obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o @@ -123,6 +121,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o ifdef CONFIG_FRAME_POINTER obj-y += unwind_frame.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c280df6b2aa2..ea3046e0b0cf 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -24,9 +24,6 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) struct acpi_hest_ia_corrected *cmc; struct acpi_hest_ia_error_bank *mc_bank; - if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) - return 0; - cmc = (struct acpi_hest_ia_corrected *)hest_hdr; if (!cmc->enabled) return 0; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 931ced8ca345..ae32838cac5f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> +#include <linux/efi-bgrt.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> @@ -76,6 +77,7 @@ int acpi_fix_pin2_polarity __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +#ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug * Hotplug side: @@ -88,6 +90,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; * ->ioapic_lock */ static DEFINE_MUTEX(acpi_ioapic_lock); +#endif /* -------------------------------------------------------------------------- Boot-time Configuration @@ -713,7 +716,7 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) int nid; nid = acpi_get_node(handle); - if (nid != -1) { + if (nid != NUMA_NO_NODE) { set_apicid_to_node(physid, nid); numa_set_node(cpu, nid); } @@ -721,11 +724,12 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) return 0; } -int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, + int *pcpu) { int cpu; - cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED); + cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); return cpu; @@ -928,6 +932,13 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) x86_platform.legacy.devices.pnpbios = 0; } + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) && + x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) { + pr_debug("ACPI: i8042 controller is absent\n"); + x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT; + } + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { pr_debug("ACPI: not registering RTC platform device\n"); x86_platform.legacy.rtc = 0; @@ -1548,6 +1559,12 @@ int __init early_acpi_boot_init(void) return 0; } +static int __init acpi_parse_bgrt(struct acpi_table_header *table) +{ + efi_bgrt_init(table); + return 0; +} + int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ @@ -1572,6 +1589,8 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); if (!acpi_noirq) x86_init.pci.init = pci_acpi_init; diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index af15f4444330..8233a630280f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -12,7 +12,6 @@ #include <linux/sched.h> #include <acpi/processor.h> -#include <asm/acpi.h> #include <asm/mwait.h> #include <asm/special_insns.h> @@ -89,7 +88,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) retval = 0; /* If the HW does not support any sub-states in this C-state */ if (num_cstate_subtype == 0) { - pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", + cx->address, edx_part); retval = -1; goto out; } @@ -104,8 +104,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) if (!mwait_supported[cstate_type]) { mwait_supported[cstate_type] = 1; printk(KERN_DEBUG - "Monitor-Mwait will be used to enter C-%d " - "state\n", cx->type); + "Monitor-Mwait will be used to enter C-%d state\n", + cx->type); } snprintf(cx->desc, ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", @@ -166,6 +166,7 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter); static int __init ffh_cstate_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + if (c->x86_vendor != X86_VENDOR_INTEL) return -1; diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 169963f471bb..50b8ed0317a3 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 +#ifdef CONFIG_KASAN + /* + * The suspend path may have poisoned some areas deeper in the stack, + * which we now need to unpoison. + */ + movq %rsp, %rdi + call kasan_unpoison_task_stack_below +#endif + xorl %eax, %eax addq $8, %rsp FRAME_END diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5cb272a7a5a3..c5b8f760473c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -337,7 +337,11 @@ done: n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); } -static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +/* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; @@ -346,7 +350,6 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); - sync_core(); local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", @@ -359,9 +362,12 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) * This implies that asymmetric systems where APs have less capabilities than * the boot processor are not handled. Tough. Make sure you disable such * features by hand. + * + * Marked "noinline" to cause control flow change and thus insn cache + * to refetch changed I$ lines. */ -void __init_or_module apply_alternatives(struct alt_instr *start, - struct alt_instr *end) +void __init_or_module noinline apply_alternatives(struct alt_instr *start, + struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; @@ -667,7 +673,6 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, unsigned long flags; local_irq_save(flags); memcpy(addr, opcode, len); - sync_core(); local_irq_restore(flags); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 63ff468a7986..df083efe6ee0 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> @@ -695,7 +696,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) return -1; } -static struct dma_map_ops gart_dma_ops = { +static const struct dma_map_ops gart_dma_ops = { .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, .map_page = gart_map_page, diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4fdf6230d93c..458da8509b75 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -13,8 +13,20 @@ #include <linux/spinlock.h> #include <asm/amd_nb.h> +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 + +/* Protect the PCI config register pairs used for SMN and DF indirect access. */ +static DEFINE_MUTEX(smn_mutex); + static u32 *flush_words; +static const struct pci_device_id amd_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, + {} +}; + const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; -EXPORT_SYMBOL(amd_nb_misc_ids); +EXPORT_SYMBOL_GPL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, @@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, {} }; @@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { } }; -struct amd_northbridge_info amd_northbridges; -EXPORT_SYMBOL(amd_northbridges); +static struct amd_northbridge_info amd_northbridges; + +u16 amd_nb_num(void) +{ + return amd_northbridges.num; +} +EXPORT_SYMBOL_GPL(amd_nb_num); + +bool amd_nb_has_feature(unsigned int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} +EXPORT_SYMBOL_GPL(amd_nb_has_feature); + +struct amd_northbridge *node_to_amd_nb(int node) +{ + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; +} +EXPORT_SYMBOL_GPL(node_to_amd_nb); static struct pci_dev *next_northbridge(struct pci_dev *dev, const struct pci_device_id *ids) @@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev, return dev; } +static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + root = node_to_amd_nb(node)->root; + if (!root) + goto out; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(root, 0x60, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + goto out_unlock; + } + + err = (write ? pci_write_config_dword(root, 0x64, *value) + : pci_read_config_dword(root, 0x64, value)); + if (err) + pr_warn("Error %s SMN address 0x%x.\n", + (write ? "writing to" : "reading from"), address); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} + +int amd_smn_read(u16 node, u32 address, u32 *value) +{ + return __amd_smn_rw(node, address, value, false); +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): Constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO + * and FICAD HI registers but so far we only need the LO register. + */ +int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + struct pci_dev *F4; + u32 ficaa; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + ficaa = 1; + ficaa |= reg & 0x3FC; + ficaa |= (func & 0x7) << 11; + ficaa |= instance_id << 16; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(F4, 0x5C, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, 0x98, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} +EXPORT_SYMBOL_GPL(amd_df_indirect_read); + int amd_cache_northbridges(void) { u16 i = 0; struct amd_northbridge *nb; - struct pci_dev *misc, *link; + struct pci_dev *root, *misc, *link; - if (amd_nb_num()) + if (amd_northbridges.num) return 0; misc = NULL; @@ -74,15 +198,17 @@ int amd_cache_northbridges(void) if (!i) return -ENODEV; - nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; amd_northbridges.num = i; - link = misc = NULL; - for (i = 0; i != amd_nb_num(); i++) { + link = misc = root = NULL; + for (i = 0; i != amd_northbridges.num; i++) { + node_to_amd_nb(i)->root = root = + next_northbridge(root, amd_root_ids); node_to_amd_nb(i)->misc = misc = next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = @@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res) { u32 address; u64 base, msr; - unsigned segn_busn_bits; + unsigned int segn_busn_bits; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return NULL; /* assume all cpus from fam10h have mmconfig */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return NULL; address = MSR_FAM10H_MMIO_CONF_BASE; @@ -226,14 +352,14 @@ static void amd_cache_gart(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL); if (!flush_words) { amd_northbridges.flags &= ~AMD_NB_GART; pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return; } - for (i = 0; i != amd_nb_num(); i++) + for (i = 0; i != amd_northbridges.num; i++) pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); } @@ -246,18 +372,20 @@ void amd_flush_garts(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ + /* + * Avoid races between AGP and IOMMU. In theory it's not needed + * but I'm not sure if the hardware won't lose flush requests + * when another is pending. This whole thing is so expensive anyways + * that it doesn't matter to serialize more. -AK + */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, flush_words[i] | 1); flushed++; } - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 456316f6c868..65721dc73bd8 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -234,7 +234,7 @@ static __init int apbt_late_init(void) if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || !apb_timer_block_enabled) return 0; - return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL, + return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL, apbt_cpu_dead); } fs_initcall(apbt_late_init); @@ -247,7 +247,7 @@ void apbt_setup_secondary_clock(void) {} static int apbt_clocksource_register(void) { u64 start, now; - cycle_t t1; + u64 t1; /* Start the counter, use timer 2 as source, timer 0/1 for event */ dw_apb_clocksource_start(clocksource_apbt); @@ -355,7 +355,7 @@ unsigned long apbt_quick_calibrate(void) { int i, scale; u64 old, new; - cycle_t t1, t2; + u64 t1, t2; unsigned long khz = 0; u32 loop, shift; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 88c657b057e2..4261b3282ad9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -48,7 +48,6 @@ #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> -#include <asm/idle.h> #include <asm/mtrr.h> #include <asm/time.h> #include <asm/smp.h> @@ -530,18 +529,19 @@ static void lapic_timer_broadcast(const struct cpumask *mask) * The local apic timer can be used for any function which is CPU local. */ static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP - | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_state_shutdown = lapic_timer_shutdown, - .set_state_periodic = lapic_timer_set_periodic, - .set_state_oneshot = lapic_timer_set_oneshot, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_state_oneshot_stopped = lapic_timer_shutdown, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); @@ -894,11 +894,13 @@ void __init setup_boot_APIC_clock(void) /* Setup the lapic or request the broadcast */ setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } void setup_secondary_APIC_clock(void) { setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } /* @@ -1244,7 +1246,7 @@ static void lapic_setup_esr(void) /** * setup_local_APIC - setup the local APIC * - * Used to setup local APIC while initializing BSP or bringin up APs. + * Used to setup local APIC while initializing BSP or bringing up APs. * Always called with preemption disabled. */ void setup_local_APIC(void) @@ -1863,14 +1865,14 @@ static void __smp_spurious_interrupt(u8 vector) "should never happen.\n", vector, smp_processor_id()); } -__visible void smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(~regs->orig_ax); exiting_irq(); } -__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) { u8 vector = ~regs->orig_ax; @@ -1921,14 +1923,14 @@ static void __smp_error_interrupt(struct pt_regs *regs) } -__visible void smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -__visible void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); @@ -2027,8 +2029,8 @@ void disconnect_bsp_APIC(int virt_wire_setup) /* * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated * contiguously, it equals to current allocated max logical CPU ID plus 1. - * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of - * nr_logical_cpuids is nr_cpu_ids. + * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, + * so the maximum of nr_logical_cpuids is nr_cpu_ids. * * NOTE: Reserve 0 for BSP. */ @@ -2093,7 +2095,7 @@ int __generic_processor_info(int apicid, int version, bool enabled) * Since fixing handling of boot_cpu_physical_apicid requires * another discussion and tests on each platform, we leave it * for now and here we use read_apic_id() directly in this - * function, generic_processor_info(). + * function, __generic_processor_info(). */ if (disabled_cpu_apicid != BAD_APICID && disabled_cpu_apicid != read_apic_id() && @@ -2159,21 +2161,6 @@ int __generic_processor_info(int apicid, int version, bool enabled) } /* - * This can happen on physical hotplug. The sanity check at boot time - * is done from native_smp_prepare_cpus() after num_possible_cpus() is - * established. - */ - if (topology_update_package_map(apicid, cpu) < 0) { - int thiscpu = max + disabled_cpus; - - pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", - thiscpu, apicid); - - disabled_cpus++; - return -ENOSPC; - } - - /* * Validate version */ if (version == 0x0) { @@ -2263,6 +2250,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { /* Should happen once for each apic */ WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->native_eoi_write = (*drv)->eoi_write; (*drv)->eoi_write = eoi_write; } } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 48e6d84f173e..347bb9f65737 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,7 +48,6 @@ #include <linux/bootmem.h> #include <asm/irqdomain.h> -#include <asm/idle.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/cpu.h> @@ -1108,12 +1107,12 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) ioapic = mp_find_ioapic(gsi); if (ioapic < 0) - return -1; + return -ENODEV; pin = mp_find_ioapic_pin(ioapic, gsi); idx = find_irq_entry(ioapic, pin, mp_INT); if ((flags & IOAPIC_MAP_CHECK) && idx < 0) - return -1; + return -ENODEV; return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } @@ -1876,6 +1875,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -1887,6 +1887,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_ack = irq_chip_ack_parent, .irq_eoi = ioapic_ir_ack_level, .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2116,6 +2117,7 @@ static inline void __init check_timer(void) if (idx != -1 && irq_trigger(idx)) unmask_ioapic_irq(irq_get_chip_data(0)); } + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) @@ -2137,6 +2139,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 015bbf30e3e3..c61aec7e65f4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -82,7 +82,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (domain == NULL) return -ENOSYS; - return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); + return msi_domain_alloc_irqs(domain, &dev->dev, nvec); } void native_teardown_msi_irq(unsigned int irq) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5d30c5e42bb1..f3557a1eb562 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(data); } -asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 200af5ae9662..5a35f208ed95 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -191,7 +191,7 @@ static int x2apic_cluster_probe(void) if (!x2apic_mode) return 0; - ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", x2apic_prepare_cpu, x2apic_dead_cpu); if (ret < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 35690a168cf7..e9f8f8cdd570 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -41,40 +41,44 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); -#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) - -static enum uv_system_type uv_system_type; -static u64 gru_start_paddr, gru_end_paddr; -static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; -static u64 gru_dist_lmask, gru_dist_umask; -static union uvh_apicid uvh_apicid; - -/* info derived from CPUID */ +static enum uv_system_type uv_system_type; +static bool uv_hubless_system; +static u64 gru_start_paddr, gru_end_paddr; +static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; +static u64 gru_dist_lmask, gru_dist_umask; +static union uvh_apicid uvh_apicid; + +/* Information derived from CPUID: */ static struct { unsigned int apicid_shift; unsigned int apicid_mask; unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ unsigned int pnode_mask; unsigned int gpa_shift; + unsigned int gnode_shift; } uv_cpuid; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); + unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; -/* Set this to use hardware error handler instead of kernel panic */ +/* Set this to use hardware error handler instead of kernel panic: */ static int disable_uv_undefined_panic = 1; + unsigned long uv_undefined(char *str) { if (likely(!disable_uv_undefined_panic)) panic("UV: error: undefined MMR: %s\n", str); else pr_crit("UV: error: undefined MMR: %s\n", str); - return ~0ul; /* cause a machine fault */ + + /* Cause a machine fault: */ + return ~0ul; } EXPORT_SYMBOL(uv_undefined); @@ -85,18 +89,19 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); val = *mmr; early_iounmap(mmr, sizeof(*mmr)); + return val; } static inline bool is_GRU_range(u64 start, u64 end) { if (gru_dist_base) { - u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */ - u64 sl = start & gru_dist_lmask; /* base offset bits */ + u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */ + u64 sl = start & gru_dist_lmask; /* Base offset bits */ u64 eu = end & gru_dist_umask; u64 el = end & gru_dist_lmask; - /* Must reside completely within a single GRU range */ + /* Must reside completely within a single GRU range: */ return (sl == gru_dist_base && el == gru_dist_base && su >= gru_first_node_paddr && su <= gru_last_node_paddr && @@ -133,13 +138,14 @@ static int __init early_get_pnodeid(void) break; case UV4_HUB_PART_NUMBER: uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ break; } uv_hub_info->hub_revision = uv_min_hub_revision_id; uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; - uv_cpuid.gpa_shift = 46; /* default unless changed */ + uv_cpuid.gpa_shift = 46; /* Default unless changed */ pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", node_id.s.revision, node_id.s.part_number, node_id.s.node_id, @@ -147,11 +153,12 @@ static int __init early_get_pnodeid(void) return pnode; } -/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ -#define SMT_LEVEL 0 /* leaf 0xb SMT level */ -#define INVALID_TYPE 0 /* leaf 0xb sub-leaf types */ -#define SMT_TYPE 1 -#define CORE_TYPE 2 +/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ + +#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 #define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) @@ -165,11 +172,13 @@ static void set_x2apic_bits(void) pr_info("UV: CPU does not have CPUID.11\n"); return; } + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { pr_info("UV: CPUID.11 not implemented\n"); return; } + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; do { @@ -180,8 +189,9 @@ static void set_x2apic_bits(void) } sub_index++; } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - uv_cpuid.apicid_shift = 0; - uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); uv_cpuid.socketid_shift = sid_shift; } @@ -192,10 +202,8 @@ static void __init early_get_apic_socketid_shift(void) set_x2apic_bits(); - pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", - uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); - pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", - uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } /* @@ -208,10 +216,8 @@ static void __init uv_set_apicid_hibit(void) union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; if (is_uv1_hub()) { - apicid_mask.v = - uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); - uv_apicid_hibits = - apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; + apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); + uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; } } @@ -220,20 +226,26 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) int pnodeid; int uv_apic; - if (strncmp(oem_id, "SGI", 3) != 0) + if (strncmp(oem_id, "SGI", 3) != 0) { + if (strncmp(oem_id, "NSGI", 4) == 0) { + uv_hubless_system = true; + pr_info("UV: OEM IDs %s/%s, HUBLESS\n", + oem_id, oem_table_id); + } return 0; + } if (numa_off) { pr_err("UV: NUMA is off, disabling UV support\n"); return 0; } - /* Setup early hub type field in uv_hub_info for Node 0 */ + /* Set up early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; /* * Determine UV arch type. - * SGI: UV100/1000 + * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) * SGI4: UV400 (truncated to 4 chars because of different varieties) @@ -249,31 +261,32 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) pnodeid = early_get_pnodeid(); early_get_apic_socketid_shift(); - x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; - if (!strcmp(oem_table_id, "UVX")) { /* most common */ + if (!strcmp(oem_table_id, "UVX")) { + /* This is the most common hardware variant: */ uv_system_type = UV_X2APIC; uv_apic = 0; - } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */ + } else if (!strcmp(oem_table_id, "UVH")) { + /* Only UV1 systems: */ uv_system_type = UV_NON_UNIQUE_APIC; - __this_cpu_write(x2apic_extra_bits, - pnodeid << uvh_apicid.s.pnode_shift); + __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); uv_set_apicid_hibit(); uv_apic = 1; - } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */ - uv_system_type = UV_LEGACY_APIC; /* very small systems */ + } else if (!strcmp(oem_table_id, "UVL")) { + /* Only used for very small systems: */ + uv_system_type = UV_LEGACY_APIC; uv_apic = 0; } else { goto badbios; } - pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", - oem_id, oem_table_id, uv_system_type, - uv_min_hub_revision_id, uv_apic); + pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); return uv_apic; @@ -294,6 +307,12 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); +int is_uv_hubless(void) +{ + return uv_hubless_system; +} +EXPORT_SYMBOL_GPL(is_uv_hubless); + void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); @@ -306,16 +325,18 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* the following values are used for the per node hub info struct */ -static __initdata unsigned short *_node_to_pnode; -static __initdata unsigned short _min_socket, _max_socket; -static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; -static __initdata struct uv_gam_range_entry *uv_gre_table; -static __initdata struct uv_gam_parameters *uv_gp_table; -static __initdata unsigned short *_socket_to_node; -static __initdata unsigned short *_socket_to_pnode; -static __initdata unsigned short *_pnode_to_socket; -static __initdata struct uv_gam_range_s *_gr_table; +/* The following values are used for the per node hub info struct */ +static __initdata unsigned short *_node_to_pnode; +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; + +static __initdata struct uv_gam_range_s *_gr_table; + #define SOCK_EMPTY ((unsigned short)~0) extern int uv_hub_info_version(void) @@ -324,7 +345,7 @@ extern int uv_hub_info_version(void) } EXPORT_SYMBOL(uv_hub_info_version); -/* Build GAM range lookup table */ +/* Build GAM range lookup table: */ static __init void build_uv_gr_table(void) { struct uv_gam_range_entry *gre = uv_gre_table; @@ -342,25 +363,24 @@ static __init void build_uv_gr_table(void) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { - if (!ram_limit) { /* mark hole between ram/non-ram */ + if (!ram_limit) { + /* Mark hole between RAM/non-RAM: */ ram_limit = last_limit; last_limit = gre->limit; lsid++; continue; } last_limit = gre->limit; - pr_info("UV: extra hole in GAM RE table @%d\n", - (int)(gre - uv_gre_table)); + pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table)); continue; } if (_max_socket < gre->sockid) { - pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", - gre->sockid, _max_socket, - (int)(gre - uv_gre_table)); + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table)); continue; } sid = gre->sockid - _min_socket; - if (lsid < sid) { /* new range */ + if (lsid < sid) { + /* New range: */ grt = &_gr_table[indx]; grt->base = lindx; grt->nasid = gre->nasid; @@ -369,27 +389,32 @@ static __init void build_uv_gr_table(void) lindx = indx++; continue; } - if (lsid == sid && !ram_limit) { /* update range */ - if (grt->limit == last_limit) { /* .. if contiguous */ + /* Update range: */ + if (lsid == sid && !ram_limit) { + /* .. if contiguous: */ + if (grt->limit == last_limit) { grt->limit = last_limit = gre->limit; continue; } } - if (!ram_limit) { /* non-contiguous ram range */ + /* Non-contiguous RAM range: */ + if (!ram_limit) { grt++; grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; } - grt++; /* non-contiguous/non-ram */ - grt->base = grt - _gr_table; /* base is this entry */ + /* Non-contiguous/non-RAM: */ + grt++; + /* base is this entry */ + grt->base = grt - _gr_table; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid++; } - /* shorten table if possible */ + /* Shorten table if possible */ grt++; i = grt - _gr_table; if (i < _gr_table_len) { @@ -403,16 +428,15 @@ static __init void build_uv_gr_table(void) } } - /* display resultant gam range table */ + /* Display resultant GAM range table: */ for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + unsigned long start, end; int gb = grt->base; - unsigned long start = gb < 0 ? 0 : - (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; - unsigned long end = - (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; - pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", - i, grt->nasid, start, end, gb); + start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb); } } @@ -423,16 +447,19 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) pnode = uv_apicid_to_pnode(phys_apicid); phys_apicid |= uv_apicid_hibits; + val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_STARTUP; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); return 0; @@ -566,7 +593,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .irq_dest_mode = 0, /* Physical */ .target_cpus = online_target_cpus, .disable_esr = 0, @@ -627,23 +654,22 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) switch (i) { case 0: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; break; case 1: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; break; case 2: m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; break; } alias.v = uv_read_local_mmr(m_overlay); if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(m_redirect); - *base = (unsigned long)redirect.s.dest_base - << DEST_SHIFT; + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } @@ -652,8 +678,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int pshift, - int bshift, int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; @@ -678,16 +703,19 @@ static __init void map_gru_distributed(unsigned long c) int nid; gru.v = c; - /* only base bits 42:28 relevant in dist mode */ + + /* Only base bits 42:28 relevant in dist mode */ gru_dist_base = gru.v & 0x000007fff0000000UL; if (!gru_dist_base) { pr_info("UV: Map GRU_DIST base address NULL\n"); return; } + bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ + for_each_online_node(nid) { paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | gru_dist_base; @@ -695,11 +723,12 @@ static __init void map_gru_distributed(unsigned long c) gru_first_node_paddr = min(paddr, gru_first_node_paddr); gru_last_node_paddr = max(paddr, gru_last_node_paddr); } + /* Save upper (63:M) bits of address only for is_GRU_range */ gru_first_node_paddr &= gru_dist_umask; gru_last_node_paddr &= gru_dist_umask; - pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", - gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); + + pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); } static __init void map_gru_high(int max_pnode) @@ -719,6 +748,7 @@ static __init void map_gru_high(int max_pnode) map_gru_distributed(gru.v); return; } + base = (gru.v & mask) >> shift; map_high("GRU", base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)base << shift); @@ -772,8 +802,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) id = mmiohs[index].id; overlay.v = uv_read_local_mmr(mmiohs[index].overlay); - pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", - id, overlay.v, overlay.s3.base, overlay.s3.m_io); + + pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io); if (!overlay.s3.enable) { pr_info("UV: %s disabled\n", id); return; @@ -784,7 +814,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) m_io = overlay.s3.m_io; mmr = mmiohs[index].redirect; n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; - min_pnode *= 2; /* convert to NASID */ + /* Convert to NASID: */ + min_pnode *= 2; max_pnode *= 2; max_io = lnasid = fi = li = -1; @@ -793,16 +824,18 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) redirect.v = uv_read_local_mmr(mmr + i * 8); nasid = redirect.s3.nasid; + /* Invalid NASID: */ if (nasid < min_pnode || max_pnode < nasid) - nasid = -1; /* invalid NASID */ + nasid = -1; if (nasid == lnasid) { li = i; - if (i != n-1) /* last entry check */ + /* Last entry check: */ + if (i != n-1) continue; } - /* check if we have a cached (or last) redirect to print */ + /* Check if we have a cached (or last) redirect to print: */ if (lnasid != -1 || (i == n-1 && nasid != -1)) { unsigned long addr1, addr2; int f, l; @@ -814,12 +847,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) f = fi; l = li; } - addr1 = (base << shift) + - f * (1ULL << m_io); - addr2 = (base << shift) + - (l + 1) * (1ULL << m_io); - pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", - id, fi, li, lnasid, addr1, addr2); + addr1 = (base << shift) + f * (1ULL << m_io); + addr2 = (base << shift) + (l + 1) * (1ULL << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); if (max_io < l) max_io = l; } @@ -827,8 +857,7 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) lnasid = nasid; } - pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", - id, base, shift, m_io, max_io); + pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io); if (max_io >= 0) map_high(id, base, shift, m_io, max_io, map_uc); @@ -841,36 +870,35 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) int shift, enable, m_io, n_io; if (is_uv3_hub() || is_uv4_hub()) { - /* Map both MMIOH Regions */ + /* Map both MMIOH regions: */ map_mmioh_high_uv3(0, min_pnode, max_pnode); map_mmioh_high_uv3(1, min_pnode, max_pnode); return; } if (is_uv1_hub()) { - mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s1.enable; - base = mmioh.s1.base; - m_io = mmioh.s1.m_io; - n_io = mmioh.s1.n_io; + mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s1.enable; + base = mmioh.s1.base; + m_io = mmioh.s1.m_io; + n_io = mmioh.s1.n_io; } else if (is_uv2_hub()) { - mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s2.enable; - base = mmioh.s2.base; - m_io = mmioh.s2.m_io; - n_io = mmioh.s2.n_io; - } else + mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s2.enable; + base = mmioh.s2.base; + m_io = mmioh.s2.m_io; + n_io = mmioh.s2.n_io; + } else { return; + } if (enable) { max_pnode &= (1 << n_io) - 1; - pr_info( - "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", - base, shift, m_io, n_io, max_pnode); + pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode); map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); } else { pr_info("UV: MMIOH disabled\n"); @@ -888,16 +916,16 @@ static __init void uv_rtc_init(void) long status; u64 ticks_per_sec; - status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, - &ticks_per_sec); + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { - printk(KERN_WARNING - "unable to determine platform RTC clock frequency, " - "guessing.\n"); - /* BIOS gives wrong value for clock freq. so guess */ + pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n"); + + /* BIOS gives wrong value for clock frequency, so guess: */ sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; - } else + } else { sn_rtc_cycles_per_second = ticks_per_sec; + } } /* @@ -908,19 +936,19 @@ static void uv_heartbeat(unsigned long ignored) struct timer_list *timer = &uv_scir_info->timer; unsigned char bits = uv_scir_info->state; - /* flip heartbeat bit */ + /* Flip heartbeat bit: */ bits ^= SCIR_CPU_HEARTBEAT; - /* is this cpu idle? */ + /* Is this CPU idle? */ if (idle_cpu(raw_smp_processor_id())) bits &= ~SCIR_CPU_ACTIVITY; else bits |= SCIR_CPU_ACTIVITY; - /* update system controller interface reg */ + /* Update system controller interface reg: */ uv_set_scir_bits(bits); - /* enable next timer period */ + /* Enable next timer period: */ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } @@ -935,7 +963,7 @@ static int uv_heartbeat_enable(unsigned int cpu) add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; - /* also ensure that boot cpu is enabled */ + /* Also ensure that boot CPU is enabled: */ cpu = 0; } return 0; @@ -968,9 +996,11 @@ static __init int uv_init_heartbeat(void) { int cpu; - if (is_uv_system()) + if (is_uv_system()) { for_each_online_cpu(cpu) uv_heartbeat_enable(cpu); + } + return 0; } @@ -979,14 +1009,10 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ /* Direct Legacy VGA I/O traffic to designated IOH */ -int uv_set_vga_state(struct pci_dev *pdev, bool decode, - unsigned int command_bits, u32 flags) +int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) { int domain, bus, rc; - PR_DEVEL("devfn %x decode %d cmd %x flags %d\n", - pdev->devfn, decode, command_bits, flags); - if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) return 0; @@ -997,13 +1023,12 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, bus = pdev->bus->number; rc = uv_bios_set_legacy_vga_target(decode, domain, bus); - PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); return rc; } /* - * Called on each cpu to initialize the per_cpu UV data area. + * Called on each CPU to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet */ void uv_cpu_init(void) @@ -1030,90 +1055,79 @@ static void get_mn(struct mn *mnp) union uvh_rh_gam_config_mmr_u m_n_config; union uv3h_gr0_gam_gr_config_u m_gr_config; - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); - mnp->n_val = m_n_config.s.n_skt; + /* Make sure the whole structure is well initialized: */ + memset(mnp, 0, sizeof(*mnp)); + + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); + mnp->n_val = m_n_config.s.n_skt; + if (is_uv4_hub()) { - mnp->m_val = 0; - mnp->n_lshift = 0; + mnp->m_val = 0; + mnp->n_lshift = 0; } else if (is_uv3_hub()) { - mnp->m_val = m_n_config.s3.m_skt; - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); - mnp->n_lshift = m_gr_config.s3.m_skt; + mnp->m_val = m_n_config.s3.m_skt; + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; } else if (is_uv2_hub()) { - mnp->m_val = m_n_config.s2.m_skt; - mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + mnp->m_val = m_n_config.s2.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; } else if (is_uv1_hub()) { - mnp->m_val = m_n_config.s1.m_skt; - mnp->n_lshift = mnp->m_val; + mnp->m_val = m_n_config.s1.m_skt; + mnp->n_lshift = mnp->m_val; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; } -void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) +void __init uv_init_hub_info(struct uv_hub_info_s *hi) { - struct mn mn = {0}; /* avoid unitialized warnings */ union uvh_node_id_u node_id; + struct mn mn; get_mn(&mn); - hub_info->m_val = mn.m_val; - hub_info->n_val = mn.n_val; - hub_info->m_shift = mn.m_shift; - hub_info->n_lshift = mn.n_lshift ? mn.n_lshift : 0; - - hub_info->hub_revision = uv_hub_info->hub_revision; - hub_info->pnode_mask = uv_cpuid.pnode_mask; - hub_info->min_pnode = _min_pnode; - hub_info->min_socket = _min_socket; - hub_info->pnode_to_socket = _pnode_to_socket; - hub_info->socket_to_node = _socket_to_node; - hub_info->socket_to_pnode = _socket_to_pnode; - hub_info->gr_table_len = _gr_table_len; - hub_info->gr_table = _gr_table; - hub_info->gpa_mask = mn.m_val ? + hi->gpa_mask = mn.m_val ? (1UL << (mn.m_val + mn.n_val)) - 1 : (1UL << uv_cpuid.gpa_shift) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - hub_info->gnode_extra = - (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1; - - hub_info->gnode_upper = - ((unsigned long)hub_info->gnode_extra << mn.m_val); + hi->m_val = mn.m_val; + hi->n_val = mn.n_val; + hi->m_shift = mn.m_shift; + hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + hi->hub_revision = uv_hub_info->hub_revision; + hi->pnode_mask = uv_cpuid.pnode_mask; + hi->min_pnode = _min_pnode; + hi->min_socket = _min_socket; + hi->pnode_to_socket = _pnode_to_socket; + hi->socket_to_node = _socket_to_node; + hi->socket_to_pnode = _socket_to_pnode; + hi->gr_table_len = _gr_table_len; + hi->gr_table = _gr_table; + + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); + hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + hi->gnode_upper = (unsigned long)hi->gnode_extra << mn.m_val; if (uv_gp_table) { - hub_info->global_mmr_base = uv_gp_table->mmr_base; - hub_info->global_mmr_shift = uv_gp_table->mmr_shift; - hub_info->global_gru_base = uv_gp_table->gru_base; - hub_info->global_gru_shift = uv_gp_table->gru_shift; - hub_info->gpa_shift = uv_gp_table->gpa_shift; - hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1; + hi->global_mmr_base = uv_gp_table->mmr_base; + hi->global_mmr_shift = uv_gp_table->mmr_shift; + hi->global_gru_base = uv_gp_table->gru_base; + hi->global_gru_shift = uv_gp_table->gru_shift; + hi->gpa_shift = uv_gp_table->gpa_shift; + hi->gpa_mask = (1UL << hi->gpa_shift) - 1; } else { - hub_info->global_mmr_base = - uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & - ~UV_MMR_ENABLE; - hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; } - get_lowmem_redirect( - &hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top); - - hub_info->apic_pnode_shift = uv_cpuid.socketid_shift; - - /* show system specific info */ - pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", - hub_info->n_val, hub_info->m_val, - hub_info->m_shift, hub_info->n_lshift); - - pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", - hub_info->gpa_mask, hub_info->gpa_shift, - hub_info->pnode_mask, hub_info->apic_pnode_shift); + get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top); - pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", - hub_info->global_mmr_base, hub_info->global_mmr_shift, - hub_info->global_gru_base, hub_info->global_gru_shift); + hi->apic_pnode_shift = uv_cpuid.socketid_shift; - pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", - hub_info->gnode_upper, hub_info->gnode_extra); + /* Show system specific info: */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); + pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); } static void __init decode_gam_params(unsigned long ptr) @@ -1139,12 +1153,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", - "Range", "", "Size", "Type", "NASID", - "SID", "PN"); + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); } - pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", + pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, @@ -1162,29 +1173,32 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) if (pnode_max < gre->pnode) pnode_max = gre->pnode; } - _min_socket = sock_min; - _max_socket = sock_max; - _min_pnode = pnode_min; - _max_pnode = pnode_max; - _gr_table_len = index; - pr_info( - "UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", - index, _min_socket, _max_socket, _min_pnode, _max_pnode); + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); } -static void __init decode_uv_systab(void) +static int __init decode_uv_systab(void) { struct uv_systab *st; int i; + if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + return 0; /* No extended UVsystab required */ + st = uv_systab; - if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub()) - return; - if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) { - pr_crit( - "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", - st->revision, UV_SYSTAB_VERSION_UV4_LATEST); - BUG(); + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { + int rev = st ? st->revision : 0; + + pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Cannot support UV operations, switching to generic PC\n"); + uv_system_type = UV_NONE; + + return -EINVAL; } for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { @@ -1205,10 +1219,11 @@ static void __init decode_uv_systab(void) break; } } + return 0; } /* - * Setup physical blade translations from UVH_NODE_PRESENT_TABLE + * Set up physical blade translations from UVH_NODE_PRESENT_TABLE * .. NB: UVH_NODE_PRESENT_TABLE is going away, * .. being replaced by GAM Range Table */ @@ -1244,14 +1259,13 @@ static void __init build_socket_tables(void) if (!gre) { if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { pr_info("UV: No UVsystab socket table, ignoring\n"); - return; /* not required */ + return; } - pr_crit( - "UV: Error: UVsystab address translations not available!\n"); + pr_crit("UV: Error: UVsystab address translations not available!\n"); BUG(); } - /* build socket id -> node id, pnode */ + /* Build socket id -> node id, pnode */ num = maxsock - minsock + 1; bytes = num * sizeof(_socket_to_node[0]); _socket_to_node = kmalloc(bytes, GFP_KERNEL); @@ -1268,27 +1282,27 @@ static void __init build_socket_tables(void) for (i = 0; i < nump; i++) _pnode_to_socket[i] = SOCK_EMPTY; - /* fill in pnode/node/addr conversion list values */ + /* Fill in pnode/node/addr conversion list values: */ pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; i = gre->sockid - minsock; + /* Duplicate: */ if (_socket_to_pnode[i] != SOCK_EMPTY) - continue; /* duplicate */ + continue; _socket_to_pnode[i] = gre->pnode; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; - pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", + pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* Set socket -> node values */ + /* Set socket -> node values: */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1304,7 +1318,7 @@ static void __init build_socket_tables(void) sockid, apicid, nid); } - /* Setup physical blade to pnode translation from GAM Range Table */ + /* Set up physical blade to pnode translation from GAM Range Table: */ bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); _node_to_pnode = kmalloc(bytes, GFP_KERNEL); BUG_ON(!_node_to_pnode); @@ -1314,8 +1328,7 @@ static void __init build_socket_tables(void) for (sockid = minsock; sockid <= maxsock; sockid++) { if (lnid == _socket_to_node[sockid - minsock]) { - _node_to_pnode[lnid] = - _socket_to_pnode[sockid - minsock]; + _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock]; break; } } @@ -1332,8 +1345,7 @@ static void __init build_socket_tables(void) pr_info("UV: Checking socket->node/pnode for identity maps\n"); if (minsock == 0) { for (i = 0; i < num; i++) - if (_socket_to_node[i] == SOCK_EMPTY || - i != _socket_to_node[i]) + if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i]) break; if (i >= num) { kfree(_socket_to_node); @@ -1354,7 +1366,7 @@ static void __init build_socket_tables(void) } } -void __init uv_system_init(void) +static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; int bytes, cpu, nodeid; @@ -1372,8 +1384,13 @@ void __init uv_system_init(void) map_low_mmrs(); - uv_bios_init(); /* get uv_systab for decoding */ - decode_uv_systab(); + /* Get uv_systab for decoding: */ + uv_bios_init(); + + /* If there's an UVsystab problem then abort UV init: */ + if (decode_uv_systab() < 0) + return; + build_socket_tables(); build_uv_gr_table(); uv_init_hub_info(&hub_info); @@ -1381,14 +1398,10 @@ void __init uv_system_init(void) if (!_node_to_pnode) boot_init_possible_blades(&hub_info); - /* uv_num_possible_blades() is really the hub count */ - pr_info("UV: Found %d hubs, %d nodes, %d cpus\n", - uv_num_possible_blades(), - num_possible_nodes(), - num_possible_cpus()); + /* uv_num_possible_blades() is really the hub count: */ + pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus()); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, - &sn_region_size, &system_serial_number); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); @@ -1401,33 +1414,31 @@ void __init uv_system_init(void) struct uv_hub_info_s *new_hub; if (__uv_hub_info_list[nodeid]) { - pr_err("UV: Node %d UV HUB already initialized!?\n", - nodeid); + pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid); BUG(); } /* Allocate new per hub info list */ - new_hub = (nodeid == 0) ? - &uv_hub_info_node0 : - kzalloc_node(bytes, GFP_KERNEL, nodeid); + new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid); BUG_ON(!new_hub); __uv_hub_info_list[nodeid] = new_hub; new_hub = uv_hub_info_list(nodeid); BUG_ON(!new_hub); *new_hub = hub_info; - /* Use information from GAM table if available */ + /* Use information from GAM table if available: */ if (_node_to_pnode) new_hub->pnode = _node_to_pnode[nodeid]; - else /* Fill in during cpu loop */ + else /* Or fill in during CPU loop: */ new_hub->pnode = 0xffff; + new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); new_hub->memory_nid = -1; new_hub->nr_possible_cpus = 0; new_hub->nr_online_cpus = 0; } - /* Initialize per cpu info */ + /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { int apicid = per_cpu(x86_cpu_to_apicid, cpu); int numa_node_id; @@ -1438,22 +1449,24 @@ void __init uv_system_init(void) pnode = uv_apicid_to_pnode(apicid); uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); - uv_cpu_info_per(cpu)->blade_cpu_id = - uv_cpu_hub_info(cpu)->nr_possible_cpus++; + uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; if (uv_cpu_hub_info(cpu)->memory_nid == -1) uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); - if (nodeid != numa_node_id && /* init memoryless node */ + + /* Init memoryless node: */ + if (nodeid != numa_node_id && uv_hub_info_list(numa_node_id)->pnode == 0xffff) uv_hub_info_list(numa_node_id)->pnode = pnode; else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } for_each_node(nodeid) { unsigned short pnode = uv_hub_info_list(nodeid)->pnode; - /* Add pnode info for pre-GAM list nodes without cpus */ + /* Add pnode info for pre-GAM list nodes without CPUs: */ if (pnode == 0xffff) { unsigned long paddr; @@ -1479,15 +1492,30 @@ void __init uv_system_init(void) uv_scir_register_cpu_notifier(); proc_mkdir("sgi_uv", NULL); - /* register Legacy VGA I/O redirection handler */ + /* Register Legacy VGA I/O redirection handler: */ pci_register_set_vga_state(uv_set_vga_state); /* * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as - * EFI is not enabled in the kdump kernel. + * EFI is not enabled in the kdump kernel: */ if (is_kdump_kernel()) reboot_type = BOOT_ACPI; } +/* + * There is a small amount of UV specific code needed to initialize a + * UV system that does not have a "UV HUB" (referred to as "hubless"). + */ +void __init uv_system_init(void) +{ + if (likely(!is_uv_system() && !is_uv_hubless())) + return; + + if (is_uv_system()) + uv_system_init_hub(); + else + uv_nmi_setup_hubless(); +} + apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 51287cd90bf6..5a414545e8a3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -218,7 +218,8 @@ #include <linux/apm_bios.h> #include <linux/init.h> #include <linux/time.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> #include <linux/pm.h> #include <linux/capability.h> #include <linux/device.h> @@ -234,7 +235,7 @@ #include <linux/i8253.h> #include <linux/cpuidle.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/desc.h> #include <asm/olpc.h> #include <asm/paravirt.h> @@ -905,21 +906,21 @@ static int apm_cpu_idle(struct cpuidle_device *dev, { static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ - static unsigned int last_stime; /* = 0 */ - cputime_t stime; + static u64 last_stime; /* = 0 */ + u64 stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime(current, NULL, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = cputime_to_jiffies(stime - last_stime); + idle_percentage = nsecs_to_jiffies(stime - last_stime); idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index c62e015b126c..de827d6ac8c2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -81,6 +81,7 @@ void common(void) { BLANK(); OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 210927ee2e74..99332f550c48 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,6 +13,10 @@ static char syscalls_ia32[] = { #include <asm/syscalls_32.h> }; +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#include <asm/kvm_para.h> +#endif + int main(void) { #ifdef CONFIG_PARAVIRT @@ -22,6 +26,11 @@ int main(void) BLANK(); #endif +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) + OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); + BLANK(); +#endif + #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4a8697f7d4ef..52000010c62e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o obj-y += match.o +obj-y += bugs.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_X86_32) += bugs.o -obj-$(CONFIG_X86_64) += bugs_64.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o @@ -34,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1e81a37c034e..35a5d5dca2fa 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include <linux/io.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/random.h> #include <asm/processor.h> #include <asm/apic.h> @@ -20,6 +21,10 @@ #include "cpu.h" +static const int amd_erratum_383[]; +static const int amd_erratum_400[]; +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX @@ -308,17 +313,43 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 7; - /* get compute unit information */ - smp_num_siblings = ((ebx >> 8) & 3) + 1; - c->x86_max_cores /= smp_num_siblings; - c->cpu_core_id = ebx & 0xff; + node_id = ecx & 0xff; + smp_num_siblings = ((ebx >> 8) & 0xff) + 1; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; + + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } + + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (cpuid_edx(0x80000006)) { + if (c->x86 == 0x17) { + /* + * LLC is at the core complex level. + * Core complex id is ApicId[3]. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = node_id; + } + } } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); node_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = node_id; } else return; @@ -329,9 +360,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_AMD_DCM); cus_per_node = c->x86_max_cores / nodes_per_socket; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = node_id; - /* core id has to be in the [0 .. cores_per_node - 1] range */ c->cpu_core_id %= cus_per_node; } @@ -356,15 +384,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); - - /* - * Fix percpu cpu_llc_id here as LLC topology is different - * for Fam17h systems. - */ - if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) - return; - - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; #endif } @@ -537,8 +556,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ @@ -585,11 +606,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) /* F16h erratum 793, CVE-2013-6885 */ if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); -} -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* + * Check whether the machine is affected by erratum 400. This is + * used to select the proper idle routine and to enable the check + * whether the machine is affected in arch_post_acpi_init(), which + * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_E400); +} static void init_amd_k8(struct cpuinfo_x86 *c) { @@ -770,9 +796,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - if (cpu_has_amd_erratum(c, amd_erratum_400)) - set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* 3DNow or LM implies PREFETCHW */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bd17db15a2c1..a44ef52184df 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,15 +16,19 @@ #include <asm/msr.h> #include <asm/paravirt.h> #include <asm/alternative.h> +#include <asm/pgtable.h> +#include <asm/cacheflush.h> void __init check_bugs(void) { identify_boot_cpu(); -#ifndef CONFIG_SMP - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + +#ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. * @@ -40,4 +44,18 @@ void __init check_bugs(void) alternative_instructions(); fpu__init_check_bugs(); +#else /* CONFIG_X86_64 */ + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +#endif } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c deleted file mode 100644 index a972ac4c7e7d..000000000000 --- a/arch/x86/kernel/cpu/bugs_64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <asm/alternative.h> -#include <asm/bugs.h> -#include <asm/processor.h> -#include <asm/mtrr.h> -#include <asm/cacheflush.h> - -void __init check_bugs(void) -{ - identify_boot_cpu(); -#if !defined(CONFIG_SMP) - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. - */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -} diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..adc0ebd8bed0 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,5 +1,6 @@ -#include <linux/bitops.h> -#include <linux/kernel.h> + +#include <linux/sched.h> +#include <linux/sched/clock.h> #include <asm/cpufeature.h> #include <asm/e820.h> @@ -104,6 +105,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + + clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc9e980c68ec..b11b38c3b0bd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -7,7 +7,9 @@ #include <linux/string.h> #include <linux/ctype.h> #include <linux/delay.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/clock.h> +#include <linux/sched/task.h> #include <linux/init.h> #include <linux/kprobes.h> #include <linux/kgdb.h> @@ -35,6 +37,7 @@ #include <asm/desc.h> #include <asm/fpu/internal.h> #include <asm/mtrr.h> +#include <asm/hwcap2.h> #include <linux/numa.h> #include <asm/asm.h> #include <asm/bugs.h> @@ -51,6 +54,8 @@ #include "cpu.h" +u32 elf_hwcap2 __read_mostly; + /* all of these masks are initialized in setup_cpu_local_masks() */ cpumask_var_t cpu_initialized_mask; cpumask_var_t cpu_callout_mask; @@ -83,6 +88,7 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif + clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -655,6 +661,16 @@ void cpu_detect(struct cpuinfo_x86 *c) } } +static void apply_forced_caps(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -667,13 +683,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_1_EDX] = edx; } + /* Thermal and Power Management Leaf: level 0x00000006 (eax) */ + if (c->cpuid_level >= 0x00000006) + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + /* Additional Intel-defined flags: level 0x00000007 */ if (c->cpuid_level >= 0x00000007) { cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_7_0_EBX] = ebx; - - c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); c->x86_capability[CPUID_7_ECX] = ecx; } @@ -747,6 +764,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); + + /* + * Clear/Set all flags overridden by options, after probe. + * This needs to happen each time we re-probe, which may happen + * several times during CPU initialization. + */ + apply_forced_caps(c); } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -800,14 +824,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) - identify_cpu_without_cpuid(c); - /* cyrix could have cpuid enabled via c_identify()*/ if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -817,6 +839,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); + } else { + identify_cpu_without_cpuid(c); + setup_clear_cpu_cap(X86_FEATURE_CPUID); } setup_force_cpu_cap(X86_FEATURE_ALWAYS); @@ -979,29 +1004,21 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c) } /* - * The physical to logical package id mapping is initialized from the - * acpi/mptables information. Make sure that CPUID actually agrees with - * that. + * Validate that ACPI/mptables have the same information about the + * effective APIC id and update the package map. */ -static void sanitize_package_id(struct cpuinfo_x86 *c) +static void validate_apic_and_package_id(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int pkg, apicid, cpu = smp_processor_id(); + unsigned int apicid, cpu = smp_processor_id(); apicid = apic->cpu_present_to_apicid(cpu); - pkg = apicid >> boot_cpu_data.x86_coreid_bits; - if (apicid != c->initial_apicid) { - pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n", + if (apicid != c->apicid) { + pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", cpu, apicid, c->initial_apicid); - c->initial_apicid = apicid; - } - if (pkg != c->phys_proc_id) { - pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n", - cpu, pkg, c->phys_proc_id); - c->phys_proc_id = pkg; } - c->logical_proc_id = topology_phys_to_logical_pkg(pkg); + BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); #else c->logical_proc_id = 0; #endif @@ -1022,6 +1039,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; + c->cu_id = 0xff; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1041,10 +1059,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); @@ -1062,6 +1077,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); + else + clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1103,10 +1120,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) * Clear/Set all flags overridden by options, need do it * before following smp all cpus cap AND. */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); /* * On SMP, boot_cpu_data holds the common feature set between @@ -1132,7 +1146,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif - sanitize_package_id(c); } /* @@ -1172,7 +1185,6 @@ void enable_sep_cpu(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); - init_amd_e400_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); @@ -1188,53 +1200,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) enable_sep_cpu(); #endif mtrr_ap_init(); + validate_apic_and_package_id(c); } -struct msr_range { - unsigned min; - unsigned max; -}; - -static const struct msr_range msr_range_array[] = { - { 0x00000000, 0x00000418}, - { 0xc0000000, 0xc000040b}, - { 0xc0010000, 0xc0010142}, - { 0xc0011000, 0xc001103b}, -}; - -static void __print_cpu_msr(void) -{ - unsigned index_min, index_max; - unsigned index; - u64 val; - int i; - - for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { - index_min = msr_range_array[i].min; - index_max = msr_range_array[i].max; - - for (index = index_min; index < index_max; index++) { - if (rdmsrl_safe(index, &val)) - continue; - pr_info(" MSR%08x: %016llx\n", index, val); - } - } -} - -static int show_msr; - -static __init int setup_show_msr(char *arg) -{ - int num; - - get_option(&arg, &num); - - if (num > 0) - show_msr = num; - return 1; -} -__setup("show_msr=", setup_show_msr); - static __init int setup_noclflush(char *arg) { setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); @@ -1268,21 +1236,13 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(", stepping: 0x%x)\n", c->x86_mask); else pr_cont(")\n"); - - print_cpu_msr(c); -} - -void print_cpu_msr(struct cpuinfo_x86 *c) -{ - if (c->cpu_index < show_msr) - __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) { int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) + if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) setup_clear_cpu_cap(bit); else return 0; @@ -1490,11 +1450,8 @@ void cpu_init(void) */ cr4_init_shadow(); - /* - * Load microcode on this cpu if a valid microcode is available. - * This is early microcode loading procedure. - */ - load_ucode_ap(); + if (cpu) + load_ucode_ap(); t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); @@ -1555,7 +1512,7 @@ void cpu_init(void) for (i = 0; i <= IO_BITMAP_LONGS; i++) t->io_bitmap[i] = ~0UL; - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); enter_lazy_tlb(&init_mm, me); @@ -1606,7 +1563,7 @@ void cpu_init(void) /* * Set up and load the per-CPU TSS and LDT */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); enter_lazy_tlb(&init_mm, curr); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..0a3bc19de017 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -9,6 +9,8 @@ #include <asm/pci-direct.h> #include <asm/tsc.h> #include <asm/cpufeature.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> #include "cpu.h" @@ -183,6 +185,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } + clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcd484d2bb03..fe0a615a051b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -4,6 +4,7 @@ #include <linux/bitops.h> #include <linux/smp.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> @@ -14,6 +15,9 @@ #include <asm/bugs.h> #include <asm/cpu.h> #include <asm/intel-family.h> +#include <asm/microcode_intel.h> +#include <asm/hwcap2.h> +#include <asm/elf.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -61,6 +65,46 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +static bool ring3mwait_disabled __read_mostly; + +static int __init ring3mwait_disable(char *__unused) +{ + ring3mwait_disabled = true; + return 0; +} +__setup("ring3mwait=disable", ring3mwait_disable); + +static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) +{ + /* + * Ring 3 MONITOR/MWAIT feature cannot be detected without + * cpu model and family comparison. + */ + if (c->x86 != 6) + return; + switch (c->x86_model) { + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + break; + default: + return; + } + + if (ring3mwait_disabled) { + msr_clear_bit(MSR_MISC_FEATURE_ENABLES, + MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + return; + } + + msr_set_bit(MSR_MISC_FEATURE_ENABLES, + MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + + set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + + if (c == &boot_cpu_data) + ELF_HWCAP2 |= HWCAP2_RING3MWAIT; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -78,14 +122,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { - unsigned lower_word; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* Required by the SDM */ - sync_core(); - rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); - } + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) + c->microcode = intel_get_microcode_revision(); /* * Atom erratum AAE44/AAF40/AAG38/AAH41: @@ -124,8 +162,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -565,6 +605,8 @@ static void init_intel(struct cpuinfo_x86 *c) detect_vmx_virtcap(c); init_intel_energy_perf(c); + + probe_xeon_phi_r3mwait(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index de6626c18e42..c55fb2cb2acc 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -11,6 +11,7 @@ #include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/capability.h> #include <linux/sysfs.h> #include <linux/pci.h> @@ -153,6 +154,7 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; + unsigned int id; unsigned long size; struct amd_northbridge *nb; }; @@ -894,6 +896,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, static void ci_leaf_init(struct cacheinfo *this_leaf, struct _cpuid4_info_regs *base) { + this_leaf->id = base->id; + this_leaf->attributes = CACHE_ID; this_leaf->level = base->eax.split.level; this_leaf->type = cache_type_map[base->eax.split.type]; this_leaf->coherency_line_size = @@ -920,6 +924,22 @@ static int __init_cache_level(unsigned int cpu) return 0; } +/* + * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + id4_regs->id = c->apicid >> index_msb; +} + static int __populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; @@ -931,9 +951,12 @@ static int __populate_cache_leaves(unsigned int cpu) ret = cpuid4_cache_lookup_regs(idx, &id4_regs); if (ret) return ret; + get_cache_id(cpu, &id4_regs); ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c new file mode 100644 index 000000000000..5a533fefefa0 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -0,0 +1,403 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/cacheinfo.h> +#include <linux/cpuhotplug.h> + +#include <asm/intel-family.h> +#include <asm/intel_rdt.h> + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); + +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) + +struct rdt_resource rdt_resources_all[] = { + { + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 0 + }, + { + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 1 + }, + { + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 2, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, +}; + +static int cbm_idx(struct rdt_resource *r, int closid) +{ + return closid * r->cbm_idx_multi + r->cbm_idx_offset; +} + +/* + * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs + * as they do not have CPUID enumeration support for Cache allocation. + * The check for Vendor/Family/Model is not enough to guarantee that + * the MSRs won't #GP fault because only the following SKUs support + * CAT: + * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz + * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz + * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz + * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz + * + * Probe by trying to write the first of the L3 cach mask registers + * and checking that the bits stick. Max CLOSids is always 4 and max cbm length + * is always 20 on hsw server parts. The minimum cache bitmask length + * allowed for HSW server is always 2 bits. Hardcode all of them. + */ +static inline bool cache_alloc_hsw_probe(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; + + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return false; + rdmsr(IA32_L3_CBM_BASE, l, h); + + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return false; + + r->num_closid = 4; + r->cbm_len = 20; + r->max_cbm = max_cbm; + r->min_cbm_bits = 2; + r->capable = true; + r->enabled = true; + + return true; + } + + return false; +} + +static void rdt_get_config(int idx, struct rdt_resource *r) +{ + union cpuid_0x10_1_eax eax; + union cpuid_0x10_1_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->cbm_len = eax.split.cbm_len + 1; + r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->capable = true; + r->enabled = true; +} + +static void rdt_get_cdp_l3_config(int type) +{ + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &rdt_resources_all[type]; + + r->num_closid = r_l3->num_closid / 2; + r->cbm_len = r_l3->cbm_len; + r->max_cbm = r_l3->max_cbm; + r->capable = true; + /* + * By default, CDP is disabled. CDP can be enabled by mount parameter + * "cdp" during resctrl file system mount time. + */ + r->enabled = false; +} + +static inline bool get_rdt_resources(void) +{ + bool ret = false; + + if (cache_alloc_hsw_probe()) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + return ret; +} + +static int get_cache_id(int cpu, int level) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) + return ci->info_list[i].id; + } + + return -1; +} + +void rdt_cbm_update(void *arg) +{ + struct msr_param *m = (struct msr_param *)arg; + struct rdt_resource *r = m->res; + int i, cpu = smp_processor_id(); + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + goto found; + } + pr_info_once("cpu %d not found in any domain for resource %s\n", + cpu, r->name); + + return; + +found: + for (i = m->low; i < m->high; i++) { + int idx = cbm_idx(r, i); + + wrmsrl(r->msr_base + idx, d->cbm[i]); + } +} + +/* + * rdt_find_domain - Find a domain in a resource that matches input resource id + * + * Search resource r's domain list to find the resource id. If the resource + * id is found in a domain, return the domain. Otherwise, if requested by + * caller, return the first domain whose id is bigger than the input id. + * The domain list is sorted by id in ascending order. + */ +static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) +{ + struct rdt_domain *d; + struct list_head *l; + + if (id < 0) + return ERR_PTR(id); + + list_for_each(l, &r->domains) { + d = list_entry(l, struct rdt_domain, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +/* + * domain_add_cpu - Add a cpu to a resource's domain list. + * + * If an existing domain in the resource r's domain list matches the cpu's + * resource id, add the cpu in the domain. + * + * Otherwise, a new domain is allocated and inserted into the right position + * in the domain list sorted by id in ascending order. + * + * The order in the domain list is visible to users when we print entries + * in the schemata file and schemata input is validated to have the same order + * as this list. + */ +static void domain_add_cpu(int cpu, struct rdt_resource *r) +{ + int i, id = get_cache_id(cpu, r->cache_level); + struct list_head *add_pos = NULL; + struct rdt_domain *d; + + d = rdt_find_domain(r, id, &add_pos); + if (IS_ERR(d)) { + pr_warn("Could't find cache id for cpu %d\n", cpu); + return; + } + + if (d) { + cpumask_set_cpu(cpu, &d->cpu_mask); + return; + } + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->id = id; + + d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); + if (!d->cbm) { + kfree(d); + return; + } + + for (i = 0; i < r->num_closid; i++) { + int idx = cbm_idx(r, i); + + d->cbm[i] = r->max_cbm; + wrmsrl(r->msr_base + idx, d->cbm[i]); + } + + cpumask_set_cpu(cpu, &d->cpu_mask); + list_add_tail(&d->list, add_pos); + r->num_domains++; +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cache_id(cpu, r->cache_level); + struct rdt_domain *d; + + d = rdt_find_domain(r, id, NULL); + if (IS_ERR_OR_NULL(d)) { + pr_warn("Could't find cache id for cpu %d\n", cpu); + return; + } + + cpumask_clear_cpu(cpu, &d->cpu_mask); + if (cpumask_empty(&d->cpu_mask)) { + r->num_domains--; + kfree(d->cbm); + list_del(&d->list); + kfree(d); + } +} + +static void clear_closid(int cpu) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + + per_cpu(cpu_closid, cpu) = 0; + state->closid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); +} + +static int intel_rdt_online_cpu(unsigned int cpu) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_add_cpu(cpu, r); + /* The cpu is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + clear_closid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int intel_rdt_offline_cpu(unsigned int cpu) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_remove_cpu(cpu, r); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + break; + } + clear_closid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int __init intel_rdt_late_init(void) +{ + struct rdt_resource *r; + int state, ret; + + if (!get_rdt_resources()) + return -ENODEV; + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/rdt/cat:online:", + intel_rdt_online_cpu, intel_rdt_offline_cpu); + if (state < 0) + return state; + + ret = rdtgroup_init(); + if (ret) { + cpuhp_remove_state(state); + return ret; + } + + for_each_capable_rdt_resource(r) + pr_info("Intel RDT %s allocation detected\n", r->name); + + return 0; +} + +late_initcall(intel_rdt_late_init); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c new file mode 100644 index 000000000000..0bbe0f3a039f --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -0,0 +1,1116 @@ +/* + * User interface for Resource Alloction in Resource Director Technology(RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu <fenghua.yu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/task_work.h> + +#include <uapi/linux/magic.h> + +#include <asm/intel_rdt.h> +#include <asm/intel_rdt_common.h> + +DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +struct kernfs_root *rdt_root; +struct rdtgroup rdtgroup_default; +LIST_HEAD(rdt_all_groups); + +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* + * Trivial allocator for CLOSIDs. Since h/w only supports a small number, + * we can keep a bitmap of free CLOSIDs in a single integer. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set "current->closid" to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. + */ +static int closid_free_map; + +static void closid_init(void) +{ + struct rdt_resource *r; + int rdt_min_closid = 32; + + /* Compute rdt_min_closid across all resources */ + for_each_enabled_rdt_resource(r) + rdt_min_closid = min(rdt_min_closid, r->num_closid); + + closid_free_map = BIT_MASK(rdt_min_closid) - 1; + + /* CLOSID 0 is always reserved for the default group */ + closid_free_map &= ~1; +} + +int closid_alloc(void) +{ + int closid = ffs(closid_free_map); + + if (closid == 0) + return -ENOSPC; + closid--; + closid_free_map &= ~(1 << closid); + + return closid; +} + +static void closid_free(int closid) +{ + closid_free_map |= 1 << closid; +} + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, + int len) +{ + struct rftype *rft; + int ret; + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) + kernfs_remove_by_name(kn, rft->name); + return ret; +} + +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * This is safe against intel_rdt_sched_in() called from __switch_to() + * because __switch_to() is executed with interrupts disabled. A local call + * from rdt_update_closid() is proteced against __switch_to() because + * preemption is disabled. + */ +static void rdt_update_cpu_closid(void *closid) +{ + if (closid) + this_cpu_write(cpu_closid, *(int *)closid); + /* + * We cannot unconditionally write the MSR because the current + * executing task might have its own closid selected. Just reuse + * the context switch code. + */ + intel_rdt_sched_in(); +} + +/* + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, + * + * Per task closids must have been set up before calling this function. + * + * The per cpu closids are updated with the smp function call, when @closid + * is not NULL. If @closid is NULL then all affected percpu closids must + * have been set up before calling this function. + */ +static void +rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +{ + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_update_cpu_closid(closid); + smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + put_cpu(); +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask; + struct rdtgroup *rdtgrp, *r; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + ret = cpumask_parse(buf, newmask); + if (ret) + goto unlock; + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (cpumask_weight(tmpmask)) { + ret = -EINVAL; + goto unlock; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + ret = -EINVAL; + goto unlock; + } + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + rdt_update_closid(tmpmask, &rdtgroup_default.closid); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu closid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); + } + rdt_update_closid(tmpmask, &rdtgrp->closid); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + + return ret ?: nbytes; +} + +struct task_move_callback { + struct callback_head work; + struct rdtgroup *rdtgrp; +}; + +static void move_myself(struct callback_head *head) +{ + struct task_move_callback *callback; + struct rdtgroup *rdtgrp; + + callback = container_of(head, struct task_move_callback, work); + rdtgrp = callback->rdtgrp; + + /* + * If resource group was deleted before this task work callback + * was invoked, then assign the task to root group and free the + * resource group. + */ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + current->closid = 0; + kfree(rdtgrp); + } + + preempt_disable(); + /* update PQR_ASSOC MSR to make resource group go into effect */ + intel_rdt_sched_in(); + preempt_enable(); + + kfree(callback); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + struct task_move_callback *callback; + int ret; + + callback = kzalloc(sizeof(*callback), GFP_KERNEL); + if (!callback) + return -ENOMEM; + callback->work.func = move_myself; + callback->rdtgrp = rdtgrp; + + /* + * Take a refcount, so rdtgrp cannot be freed before the + * callback has been invoked. + */ + atomic_inc(&rdtgrp->waitcount); + ret = task_work_add(tsk, &callback->work, true); + if (ret) { + /* + * Task is exiting. Drop the refcount and free the callback. + * No need to check the refcount as the group cannot be + * deleted before the write function unlocks rdtgroup_mutex. + */ + atomic_dec(&rdtgrp->waitcount); + kfree(callback); + } else { + tsk->closid = rdtgrp->closid; + } + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EPERM; + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + pid_t pid; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + ret = rdtgroup_move_task(pid, rdtgrp, of); + else + ret = -ENOENT; + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (t->closid == r->closid) + seq_printf(s, "%d\n", t->pid); + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* Files in each rdtgroup */ +static struct rftype rdtgroup_base_files[] = { + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + }, +}; + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_closid); + + return 0; +} + +static int rdt_cbm_mask_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->max_cbm); + + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->min_cbm_bits); + + return 0; +} + +/* rdtgroup information files for one cache resource. */ +static struct rftype res_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_cbm_mask_show, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + }, +}; + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct kernfs_node *kn_subdir; + struct rdt_resource *r; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + kernfs_get(kn_info); + + for_each_enabled_rdt_resource(r) { + kn_subdir = kernfs_create_dir(kn_info, r->name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) { + ret = PTR_ERR(kn_subdir); + goto out_destroy; + } + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + goto out_destroy; + ret = rdtgroup_add_files(kn_subdir, res_info_files, + ARRAY_SIZE(res_info_files)); + if (ret) + goto out_destroy; + kernfs_activate(kn_subdir); + } + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn_info); + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static void l3_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); +} + +static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) +{ + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + list_for_each_entry(d, &r->domains, list) { + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + } + cpu = get_cpu(); + /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + l3_qos_cfg_update(&enable); + /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +static int cdp_enable(void) +{ + struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + int ret; + + if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + return -EINVAL; + + ret = set_l3_qos_cfg(r_l3, true); + if (!ret) { + r_l3->enabled = false; + r_l3data->enabled = true; + r_l3code->enabled = true; + } + return ret; +} + +static void cdp_disable(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + + r->enabled = r->capable; + + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + set_l3_qos_cfg(r, false); + } +} + +static int parse_rdtgroupfs_options(char *data) +{ + char *token, *o = data; + int ret = 0; + + while ((token = strsep(&o, ",")) != NULL) { + if (!*token) + return -EINVAL; + + if (!strcmp(token, "cdp")) + ret = cdp_enable(); + } + + return ret; +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || kn->parent == kn_info) + return NULL; + else + return kn->priv; + } else { + return kn->parent->priv; + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); + + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? */ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + kernfs_unbreak_active_protection(kn); + kernfs_put(kn); + kfree(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +static struct dentry *rdt_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct dentry *dentry; + int ret; + + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. + */ + if (static_branch_unlikely(&rdt_enable_key)) { + dentry = ERR_PTR(-EBUSY); + goto out; + } + + ret = parse_rdtgroupfs_options(data); + if (ret) { + dentry = ERR_PTR(ret); + goto out_cdp; + } + + closid_init(); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret) { + dentry = ERR_PTR(ret); + goto out_cdp; + } + + dentry = kernfs_mount(fs_type, flags, rdt_root, + RDTGROUP_SUPER_MAGIC, NULL); + if (IS_ERR(dentry)) + goto out_cdp; + + static_branch_enable(&rdt_enable_key); + goto out; + +out_cdp: + cdp_disable(); +out: + mutex_unlock(&rdtgroup_mutex); + + return dentry; +} + +static int reset_all_cbms(struct rdt_resource *r) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int i, cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.res = r; + msr_param.low = 0; + msr_param.high = r->num_closid; + + /* + * Disable resource control for this resource by setting all + * CBMs in all domains to the maximum mask value. Pick one CPU + * from each domain to update the MSRs below. + */ + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + + for (i = 0; i < r->num_closid; i++) + d->cbm[i] = r->max_cbm; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). + * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || t->closid == from->closid) { + t->closid = to->closid; +#ifdef CONFIG_SMP + /* + * This is safe on x86 w/o barriers as the ordering + * of writing to task_cpu() and t->on_cpu is + * reverse to the reading here. The detection is + * inaccurate as tasks might move or schedule + * before the smp function call takes place. In + * such a case the function call is pointless, but + * there is no other side effect. + */ + if (mask && t->on_cpu) + cpumask_set_cpu(task_cpu(t), mask); +#endif + } + } + read_unlock(&tasklist_lock); +} + +/* + * Forcibly remove all of subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + struct rdtgroup *rdtgrp, *tmp; + + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); + + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + get_online_cpus(); + rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + put_online_cpus(); + + kernfs_remove(kn_info); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + + /*Put everything back to default values. */ + for_each_enabled_rdt_resource(r) + reset_all_cbms(r); + cdp_disable(); + rmdir_all_sub(); + static_branch_disable(&rdt_enable_key); + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .mount = rdt_mount, + .kill_sb = rdt_kill_sb, +}; + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct rdtgroup *parent, *rdtgrp; + struct kernfs_node *kn; + int ret, closid; + + /* Only allow mkdir in the root directory */ + if (parent_kn != rdtgroup_default.kn) + return -EPERM; + + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = rdtgroup_kn_lock_live(parent_kn); + if (!parent) { + ret = -ENODEV; + goto out_unlock; + } + + ret = closid_alloc(); + if (ret < 0) + goto out_unlock; + closid = ret; + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + goto out_closid_free; + } + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_cancel_ref; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn} call below. Take one extra reference + * here, which will be dropped inside rdtgroup_kn_unlock(). + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = rdtgroup_add_files(kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + ret = 0; + goto out_unlock; + +out_destroy: + kernfs_remove(rdtgrp->kn); +out_cancel_ref: + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); +out_closid_free: + closid_free(closid); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + int ret, cpu, closid = rdtgroup_default.closid; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(cpu_closid, cpu) = closid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + rdt_update_closid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + closid_free(rdtgrp->closid); + list_del(&rdtgrp->rdtgroup_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + ret = 0; +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + seq_puts(seq, ",cdp"); + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .show_options = rdtgroup_show_options, +}; + +static int __init rdtgroup_setup_root(void) +{ + int ret; + + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = 0; + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) { + kernfs_destroy_root(rdt_root); + goto out; + } + + rdtgroup_default.kn = rdt_root->kn; + kernfs_activate(rdtgroup_default.kn); + +out: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +/* + * rdtgroup_init - rdtgroup initialization + * + * Setup resctrl file system including set up root, create mount point, + * register rdtgroup filesystem, and initialize files under root directory. + * + * Return: 0 on success or -errno + */ +int __init rdtgroup_init(void) +{ + int ret = 0; + + ret = rdtgroup_setup_root(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) + goto cleanup_root; + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); +cleanup_root: + kernfs_destroy_root(rdt_root); + + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c new file mode 100644 index 000000000000..f369cb8db0d5 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -0,0 +1,245 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <asm/intel_rdt.h> + +/* + * Check whether a cache bit mask is valid. The SDM says: + * Please note that all (and only) contiguous '1' combinations + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). + * Additionally Haswell requires at least two bits set. + */ +static bool cbm_validate(unsigned long var, struct rdt_resource *r) +{ + unsigned long first_bit, zero_bit; + + if (var == 0 || var > r->max_cbm) + return false; + + first_bit = find_first_bit(&var, r->cbm_len); + zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); + + if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) + return false; + + if ((zero_bit - first_bit) < r->min_cbm_bits) + return false; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +static int parse_cbm(char *buf, struct rdt_resource *r) +{ + unsigned long data; + int ret; + + ret = kstrtoul(buf, 16, &data); + if (ret) + return ret; + if (!cbm_validate(data, r)) + return -EINVAL; + r->tmp_cbms[r->num_tmp_cbms++] = data; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must appear in the + * right order. + */ +static int parse_line(char *line, struct rdt_resource *r) +{ + char *dom = NULL, *id; + struct rdt_domain *d; + unsigned long dom_id; + + list_for_each_entry(d, &r->domains, list) { + dom = strsep(&line, ";"); + if (!dom) + return -EINVAL; + id = strsep(&dom, "="); + if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) + return -EINVAL; + if (parse_cbm(dom, r)) + return -EINVAL; + } + + /* Any garbage at the end of the line? */ + if (line && line[0]) + return -EINVAL; + return 0; +} + +static int update_domains(struct rdt_resource *r, int closid) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu, idx = 0; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.low = closid; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on other cpus. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int closid, ret = 0; + u32 *l3_cbms = NULL; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + closid = rdtgrp->closid; + + /* get scratch space to save all the masks while we validate input */ + for_each_enabled_rdt_resource(r) { + r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), + GFP_KERNEL); + if (!r->tmp_cbms) { + ret = -ENOMEM; + goto out; + } + r->num_tmp_cbms = 0; + } + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strsep(&tok, ":"); + if (!tok) { + ret = -EINVAL; + goto out; + } + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && + closid < r->num_closid) { + ret = parse_line(tok, r); + if (ret) + goto out; + break; + } + } + if (!r->name) { + ret = -EINVAL; + goto out; + } + } + + /* Did the parser find all the masks we need? */ + for_each_enabled_rdt_resource(r) { + if (r->num_tmp_cbms != r->num_domains) { + ret = -EINVAL; + goto out; + } + } + + for_each_enabled_rdt_resource(r) { + ret = update_domains(r, closid); + if (ret) + goto out; + } + +out: + rdtgroup_kn_unlock(of->kn); + for_each_enabled_rdt_resource(r) { + kfree(r->tmp_cbms); + r->tmp_cbms = NULL; + } + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +{ + struct rdt_domain *dom; + bool sep = false; + + seq_printf(s, "%s:", r->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + int closid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + closid = rdtgrp->closid; + for_each_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83f1a98d37db..2eee85379689 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; - if (severity >= GHES_SEV_PANIC) + + if (severity >= GHES_SEV_PANIC) { m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } m.addr = mem_err->physical_addr; mce_log(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 93d824ec3120..1e5a50c11d3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void) return new_head.first; } -void mce_gen_pool_process(void) +void mce_gen_pool_process(struct work_struct *__unused) { struct llist_node *head; struct mce_evt_llist *node, *tmp; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 517619ea6498..99165b206df3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -152,7 +152,6 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; @@ -192,9 +191,7 @@ static void raise_mce(struct mce *m) raise_local(); put_cpu(); put_online_cpus(); - } else -#endif - { + } else { preempt_disable(); raise_local(); preempt_enable(); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index cd74a3f00aea..903043e6a62b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -31,7 +31,7 @@ struct mce_evt_llist { struct mce mce; }; -void mce_gen_pool_process(void); +void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 631356c8cca4..87cc9ab7a13c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/debugfs.h> #include <asm/mce.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mce-internal.h" @@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e *msg = s->msg; s->covered = 1; if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { - if (panic_on_oops || tolerant < 1) + if (tolerant < 1) return MCE_PANIC_SEVERITY; } return s->sev; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7fdf453d895..8e9725c607ea 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -43,6 +43,7 @@ #include <linux/export.h> #include <linux/jump_label.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> @@ -127,7 +128,6 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -135,6 +135,9 @@ void mce_setup(struct mce *m) m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + + if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) + rdmsrl(MSR_PPIN, m->ppin); } DEFINE_PER_CPU(struct mce, injectm); @@ -207,11 +210,13 @@ EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +static atomic_t num_notifiers; + void mce_register_decode_chain(struct notifier_block *nb) { - /* Ensure SRAO notifier has the highest priority in the decode chain. */ - if (nb != &mce_srao_nb && nb->priority == INT_MAX) - nb->priority -= 1; + atomic_inc(&num_notifiers); + + WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); } @@ -219,6 +224,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { + atomic_dec(&num_notifiers); + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -270,17 +277,17 @@ struct mca_msr_regs msr_ops = { .misc = misc_reg }; -static void print_mce(struct mce *m) +static void __print_mce(struct mce *m) { - int ret = 0; - - pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", - m->extcpu, m->mcgstatus, m->bank, m->status); + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", + m->extcpu, + (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), + m->mcgstatus, m->bank, m->status); if (m->ip) { pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); @@ -308,6 +315,13 @@ static void print_mce(struct mce *m) pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); +} + +static void print_mce(struct mce *m) +{ + int ret = 0; + + __print_mce(m); /* * Print out human-readable details about the MCE error, @@ -499,7 +513,7 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_gen_pool_empty() && keventd_up()) + if (!mce_gen_pool_empty()) schedule_work(&mce_work); } @@ -566,7 +580,33 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, } static struct notifier_block mce_srao_nb = { .notifier_call = srao_decode_notifier, - .priority = INT_MAX, + .priority = MCE_PRIO_SRAO, +}; + +static int mce_default_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* + * Run the default notifier if we have only the SRAO + * notifier and us registered. + */ + if (atomic_read(&num_notifiers) > 2) + return NOTIFY_DONE; + + __print_mce(m); + + return NOTIFY_DONE; +} + +static struct notifier_block mce_default_nb = { + .notifier_call = mce_default_notifier, + /* lowest prio, we want it to run last. */ + .priority = MCE_PRIO_LOWEST, }; /* @@ -667,6 +707,9 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -674,14 +717,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); if (!(m.status & MCI_STATUS_VAL)) continue; - /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -696,9 +737,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_read_aux(&m, i); - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) @@ -1109,6 +1147,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) goto out; mce_gather_info(&m, regs); + m.tsc = rdtsc(); final = this_cpu_ptr(&mces_seen); *final = m; @@ -1275,41 +1314,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) #endif /* - * Action optional processing happens here (picking up - * from the list of faulting pages that do_machine_check() - * placed into the genpool). - */ -static void mce_process_work(struct work_struct *dummy) -{ - mce_gen_pool_process(); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). @@ -1326,20 +1330,15 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static void __restart_timer(struct timer_list *t, unsigned long interval) +static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; unsigned long flags; local_irq_save(flags); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer(t, when); - } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); - } + if (!timer_pending(t) || time_before(when, t->expires)) + mod_timer(t, round_jiffies(when)); local_irq_restore(flags); } @@ -1355,7 +1354,7 @@ static void mce_timer_fn(unsigned long data) iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); if (mce_intel_cmci_poll()) { iv = mce_adjust_timer(iv); @@ -1374,7 +1373,7 @@ static void mce_timer_fn(unsigned long data) done: __this_cpu_write(mce_next_interval, iv); - __restart_timer(t, iv); + __start_timer(t, iv); } /* @@ -1385,7 +1384,7 @@ void mce_timer_kick(unsigned long interval) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned long iv = __this_cpu_read(mce_next_interval); - __restart_timer(t, interval); + __start_timer(t, interval); if (interval < iv) __this_cpu_write(mce_next_interval, interval); @@ -1732,17 +1731,23 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct timer_list *t) +static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; if (mca_cfg.ignore_ce || !iv) return; - per_cpu(mce_next_interval, cpu) = iv; + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); +} - t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, cpu); +static void __mcheck_cpu_setup_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_pinned_timer(t, mce_timer_fn, cpu); } static void __mcheck_cpu_init_timer(void) @@ -1751,7 +1756,7 @@ static void __mcheck_cpu_init_timer(void) unsigned int cpu = smp_processor_id(); setup_pinned_timer(t, mce_timer_fn, cpu); - mce_start_timer(cpu, t); + mce_start_timer(t); } /* Handle unconfigured int18 (should never happen) */ @@ -1796,7 +1801,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_init_timer(); + __mcheck_cpu_setup_timer(); } /* @@ -2138,9 +2143,10 @@ int __init mcheck_init(void) { mcheck_intel_therm_init(); mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); - INIT_WORK(&mce_work, mce_process_work); + INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); return 0; @@ -2255,8 +2261,6 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); - static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); @@ -2409,6 +2413,10 @@ static int mce_device_create(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; + dev = per_cpu(mce_device, cpu); + if (dev) + return 0; + dev = kzalloc(sizeof *dev, GFP_KERNEL); if (!dev) return -ENOMEM; @@ -2468,28 +2476,25 @@ static void mce_device_remove(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void mce_disable_cpu(void) { - unsigned long action = *(unsigned long *)h; - if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_clear(); vendor_disable_error_reporting(); } -static void mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void) { - unsigned long action = *(unsigned long *)h; int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_reenable(); for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; @@ -2499,45 +2504,43 @@ static void mce_reenable_cpu(void *h) } } -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int mce_cpu_dead(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); + mce_intel_hcpu_update(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - mce_device_create(cpu); - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - break; - case CPU_DEAD: - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - mce_device_remove(cpu); - mce_intel_hcpu_update(cpu); + /* intentionally ignoring frozen here */ + if (!cpuhp_tasks_frozen) + cmci_rediscover(); + return 0; +} - /* intentionally ignoring frozen here */ - if (!(action & CPU_TASKS_FROZEN)) - cmci_rediscover(); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - del_timer_sync(t); - break; - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - mce_start_timer(cpu, t); - break; - } +static int mce_cpu_online(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + int ret; - return NOTIFY_OK; + mce_device_create(cpu); + + ret = mce_threshold_create_device(cpu); + if (ret) { + mce_device_remove(cpu); + return ret; + } + mce_reenable_cpu(); + mce_start_timer(t); + return 0; } -static struct notifier_block mce_cpu_notifier = { - .notifier_call = mce_cpu_callback, -}; +static int mce_cpu_pre_down(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + mce_disable_cpu(); + del_timer_sync(t); + mce_threshold_remove_device(cpu); + mce_device_remove(cpu); + return 0; +} static __init void mce_init_banks(void) { @@ -2559,8 +2562,8 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { + enum cpuhp_state hp_online; int err; - int i = 0; if (!mce_available(&boot_cpu_data)) { err = -EIO; @@ -2578,23 +2581,16 @@ static __init int mcheck_init_device(void) if (err) goto err_out_mem; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = mce_device_create(i); - if (err) { - /* - * Register notifier anyway (and do not unreg it) so - * that we don't leave undeleted timers, see notifier - * callback above. - */ - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); - goto err_device_create; - } - } + err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, + mce_cpu_dead); + if (err) + goto err_out_mem; - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", + mce_cpu_online, mce_cpu_pre_down); + if (err < 0) + goto err_out_online; + hp_online = err; register_syscore_ops(&mce_syscore_ops); @@ -2607,16 +2603,10 @@ static __init int mcheck_init_device(void) err_register: unregister_syscore_ops(&mce_syscore_ops); + cpuhp_remove_state(hp_online); -err_device_create: - /* - * We didn't keep track of which devices were created above, but - * even if we had, the set of online cpus might have changed. - * Play safe and remove for every possible cpu, since - * mce_device_remove() will do the right thing. - */ - for_each_possible_cpu(i) - mce_device_remove(i); +err_out_online: + cpuhp_remove_state(CPUHP_X86_MCE_DEAD); err_out_mem: free_cpumask_var(mce_device_initialized); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9b5403462936..524cc5780a77 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -24,7 +24,6 @@ #include <asm/amd_nb.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -55,6 +54,8 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 +static bool thresholding_en; + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; -struct smca_bank_name smca_bank_names[] = { +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +static struct smca_bank_name smca_names[] = { [SMCA_LS] = { "load_store", "Load Store Unit" }, [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, @@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = { [SMCA_PSP] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(smca_bank_names); -static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { +const char *smca_get_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].name; +} + +const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].long_name; +} +EXPORT_SYMBOL_GPL(smca_get_long_name); + +static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, xec_bitmap } */ /* ZN Core (HWID=0xB0) MCA types */ @@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -struct smca_bank_info smca_banks[MAX_NR_BANKS]; +struct smca_bank smca_banks[MAX_NR_BANKS]; EXPORT_SYMBOL_GPL(smca_banks); /* @@ -142,35 +164,35 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -/* - * CPU Initialization - */ - static void get_smca_bank_info(unsigned int bank) { unsigned int i, hwid_mcatype, cpu = smp_processor_id(); - struct smca_hwid_mcatype *type; - u32 high, instanceId; - u16 hwid, mcatype; + struct smca_hwid *s_hwid; + u32 high, instance_id; /* Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } - hwid = high & MCI_IPID_HWID; - mcatype = (high & MCI_IPID_MCATYPE) >> 16; - hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, + (high & MCI_IPID_MCATYPE) >> 16); for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { - type = &smca_hwid_mcatypes[i]; - if (hwid_mcatype == type->hwid_mcatype) { - smca_banks[bank].type = type; - smca_banks[bank].type_instance = instanceId; + s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { + + WARN(smca_banks[bank].hwid, + "Bank %s already initialized!\n", + smca_get_name(s_hwid->bank_type)); + + smca_banks[bank].hwid = s_hwid; + smca_banks[bank].id = instance_id; + smca_banks[bank].sysfs_id = s_hwid->count++; break; } } @@ -533,6 +555,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) +{ + u64 dram_base_addr, dram_limit_addr, dram_hole_base; + /* We start from the normalized address */ + u64 ret_addr = norm_addr; + + u32 tmp; + + u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; + u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; + u8 intlv_addr_sel, intlv_addr_bit; + u8 num_intlv_bits, hashed_bit; + u8 lgcy_mmio_hole_en, base = 0; + u8 cs_mask, cs_id = 0; + bool hash_enabled = false; + + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ + if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + goto out_err; + + /* Remove HiAddrOffset from normalized address, if enabled: */ + if (tmp & BIT(0)) { + u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + + if (norm_addr >= hi_addr_offset) { + ret_addr -= hi_addr_offset; + base = 1; + } + } + + /* Read D18F0x110 (DramBaseAddress). */ + if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + goto out_err; + + /* Check if address range is valid. */ + if (!(tmp & BIT(0))) { + pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", + __func__, tmp); + goto out_err; + } + + lgcy_mmio_hole_en = tmp & BIT(1); + intlv_num_chan = (tmp >> 4) & 0xF; + intlv_addr_sel = (tmp >> 8) & 0x7; + dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + + /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ + if (intlv_addr_sel > 3) { + pr_err("%s: Invalid interleave address select %d.\n", + __func__, intlv_addr_sel); + goto out_err; + } + + /* Read D18F0x114 (DramLimitAddress). */ + if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + goto out_err; + + intlv_num_sockets = (tmp >> 8) & 0x1; + intlv_num_dies = (tmp >> 10) & 0x3; + dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + + intlv_addr_bit = intlv_addr_sel + 8; + + /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ + switch (intlv_num_chan) { + case 0: intlv_num_chan = 0; break; + case 1: intlv_num_chan = 1; break; + case 3: intlv_num_chan = 2; break; + case 5: intlv_num_chan = 3; break; + case 7: intlv_num_chan = 4; break; + + case 8: intlv_num_chan = 1; + hash_enabled = true; + break; + default: + pr_err("%s: Invalid number of interleaved channels %d.\n", + __func__, intlv_num_chan); + goto out_err; + } + + num_intlv_bits = intlv_num_chan; + + if (intlv_num_dies > 2) { + pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", + __func__, intlv_num_dies); + goto out_err; + } + + num_intlv_bits += intlv_num_dies; + + /* Add a bit if sockets are interleaved. */ + num_intlv_bits += intlv_num_sockets; + + /* Assert num_intlv_bits <= 4 */ + if (num_intlv_bits > 4) { + pr_err("%s: Invalid interleave bits %d.\n", + __func__, num_intlv_bits); + goto out_err; + } + + if (num_intlv_bits > 0) { + u64 temp_addr_x, temp_addr_i, temp_addr_y; + u8 die_id_bit, sock_id_bit, cs_fabric_id; + + /* + * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. + * This is the fabric id for this coherent slave. Use + * umc/channel# as instance id of the coherent slave + * for FICAA. + */ + if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + goto out_err; + + cs_fabric_id = (tmp >> 8) & 0xFF; + die_id_bit = 0; + + /* If interleaved over more than 1 channel: */ + if (intlv_num_chan) { + die_id_bit = intlv_num_chan; + cs_mask = (1 << die_id_bit) - 1; + cs_id = cs_fabric_id & cs_mask; + } + + sock_id_bit = die_id_bit; + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (intlv_num_dies || intlv_num_sockets) + if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + goto out_err; + + /* If interleaved over more than 1 die. */ + if (intlv_num_dies) { + sock_id_bit = die_id_bit + intlv_num_dies; + die_id_shift = (tmp >> 24) & 0xF; + die_id_mask = (tmp >> 8) & 0xFF; + + cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; + } + + /* If interleaved over more than 1 socket. */ + if (intlv_num_sockets) { + socket_id_shift = (tmp >> 28) & 0xF; + socket_id_mask = (tmp >> 16) & 0xFF; + + cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; + } + + /* + * The pre-interleaved address consists of XXXXXXIIIYYYYY + * where III is the ID for this CS, and XXXXXXYYYYY are the + * address bits from the post-interleaved address. + * "num_intlv_bits" has been calculated to tell us how many "I" + * bits there are. "intlv_addr_bit" tells us how many "Y" bits + * there are (where "I" starts). + */ + temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_i = (cs_id << intlv_addr_bit); + temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + } + + /* Add dram base address */ + ret_addr += dram_base_addr; + + /* If legacy MMIO hole enabled */ + if (lgcy_mmio_hole_en) { + if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + goto out_err; + + dram_hole_base = tmp & GENMASK(31, 24); + if (ret_addr >= dram_hole_base) + ret_addr += (BIT_ULL(32) - dram_hole_base); + } + + if (hash_enabled) { + /* Save some parentheses and grab ls-bit at the end. */ + hashed_bit = (ret_addr >> 12) ^ + (ret_addr >> 18) ^ + (ret_addr >> 21) ^ + (ret_addr >> 30) ^ + cs_id; + + hashed_bit &= BIT(0); + + if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) + ret_addr ^= BIT(intlv_addr_bit); + } + + /* Is calculated system address is above DRAM limit address? */ + if (ret_addr > dram_limit_addr) + goto out_err; + + *sys_addr = ret_addr; + return 0; + +out_err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); + static void __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) { @@ -556,7 +778,8 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) mce_setup(&m); m.status = status; - m.bank = bank; + m.bank = bank; + m.tsc = rdtsc(); if (threshold_err) m.misc = misc; @@ -593,14 +816,14 @@ static inline void __smp_deferred_error_interrupt(void) deferred_error_int_vector(); } -asmlinkage __visible void smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); __smp_deferred_error_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) { entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); @@ -645,6 +868,7 @@ static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); + struct thresh_restart tr; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -681,6 +905,11 @@ static void amd_threshold_interrupt(void) log: __log_error(bank, false, true, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + threshold_restart_bank(&tr); } /* @@ -826,10 +1055,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - if (!smca_banks[bank].type) + if (!smca_banks[bank].hwid) return NULL; - bank_type = smca_banks[bank].type->bank_type; + bank_type = smca_banks[bank].hwid->bank_type; if (b && bank_type == SMCA_UMC) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) @@ -837,9 +1066,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } + if (smca_banks[bank].hwid->count == 1) + return smca_get_name(bank_type); + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_bank_names[bank_type].name, - smca_banks[bank].type_instance); + "%s_%x", smca_get_name(bank_type), + smca_banks[bank].sysfs_id); return buf_mcatype; } @@ -955,6 +1187,9 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) const char *name = get_name(bank, NULL); int err = 0; + if (!dev) + return -ENODEV; + if (is_shared_bank(bank)) { nb = node_to_amd_nb(amd_get_nb_id(cpu)); @@ -1010,31 +1245,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) return err; } -/* create dir/files for all valid threshold banks */ -static int threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - struct threshold_bank **bp; - int err = 0; - - bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, - GFP_KERNEL); - if (!bp) - return -ENOMEM; - - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - return err; - } - - return err; -} - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -1102,48 +1312,71 @@ free_out: per_cpu(threshold_banks, cpu)[bank] = NULL; } -static void threshold_remove_device(unsigned int cpu) +int mce_threshold_remove_device(unsigned int cpu) { unsigned int bank; + if (!thresholding_en) + return 0; + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } kfree(per_cpu(threshold_banks, cpu)); + per_cpu(threshold_banks, cpu) = NULL; + return 0; } -/* get notified when a cpu comes on/off */ -static void -amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) +/* create dir/files for all valid threshold banks */ +int mce_threshold_create_device(unsigned int cpu) { - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - threshold_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - threshold_remove_device(cpu); - break; - default: - break; + unsigned int bank; + struct threshold_bank **bp; + int err = 0; + + if (!thresholding_en) + return 0; + + bp = per_cpu(threshold_banks, cpu); + if (bp) + return 0; + + bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto err; } + return err; +err: + mce_threshold_remove_device(cpu); + return err; } static __init int threshold_init_device(void) { unsigned lcpu = 0; + if (mce_threshold_vector == amd_threshold_interrupt) + thresholding_en = true; + /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { - int err = threshold_create_device(lcpu); + int err = mce_threshold_create_device(lcpu); if (err) return err; } - threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1defb8ea882c..190b3e6cef4d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -11,6 +11,8 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void) * Reset the counter if we've logged an error in the last poll * during the storm. */ - if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) + if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); else this_cpu_dec(cmci_backoff_cnt); @@ -342,7 +344,7 @@ void cmci_recheck(void) return; local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); local_irq_restore(flags); } @@ -464,11 +466,46 @@ static void intel_clear_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val); } +static void intel_ppin_init(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + /* + * Even if testing the presence of the MSR would be enough, we don't + * want to risk the situation where other models reuse this MSR for + * other purposes. + */ + switch (c->x86_model) { + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SKYLAKE_X: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) + return; + + if ((val & 3UL) == 1UL) { + /* PPIN available but disabled: */ + return; + } + + /* If PPIN is disabled, but not locked, try to enable: */ + if (!(val & 3UL)) { + wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_PPIN_CTL, &val); + } + + if ((val & 3UL) == 2UL) + set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); + } +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); + intel_ppin_init(c); } void mce_intel_feature_clear(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6b9dc4d18ccc..d7cc190ae457 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -6,7 +6,7 @@ * * Maintains a counter in /sys that keeps track of the number of thermal * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog and mcelog is rate limited). + * (since the logging to syslog is rate limited). * * Author: Dmitriy Zavin (dmitriyz@google.com) * @@ -26,7 +26,6 @@ #include <asm/processor.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -142,13 +141,8 @@ static struct attribute_group thermal_attr_group = { * IRQ has been acknowledged. * * It will take care of rate limiting and printing messages to the syslog. - * - * Returns: 0 : Event should NOT be further logged, i.e. still in - * "timeout" from previous log message. - * 1 : Event should be logged further, and a message has been - * printed to the syslog. */ -static int therm_throt_process(bool new_event, int event, int level) +static void therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; unsigned int this_cpu = smp_processor_id(); @@ -163,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level) else if (event == POWER_LIMIT_EVENT) state = &pstate->core_power_limit; else - return 0; + return; } else if (level == PACKAGE_LEVEL) { if (event == THERMAL_THROTTLING_EVENT) state = &pstate->package_throttle; else if (event == POWER_LIMIT_EVENT) state = &pstate->package_power_limit; else - return 0; + return; } else - return 0; + return; old_event = state->new_event; state->new_event = new_event; @@ -182,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level) if (time_before64(now, state->next_check) && state->count != state->last_count) - return 0; + return; state->next_check = now + CHECK_INTERVAL; state->last_count = state->count; @@ -194,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - return 1; + return; } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - return 1; + return; } - - return 0; } static int thresh_event_valid(int level, int event) @@ -271,58 +263,32 @@ static void thermal_throttle_remove_dev(struct device *dev) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int thermal_throttle_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - struct device *dev; - int err = 0; - - dev = get_cpu_device(cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = thermal_throttle_add_dev(dev, cpu); - WARN_ON(err); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - thermal_throttle_remove_dev(dev); - break; - } - return notifier_from_errno(err); + struct device *dev = get_cpu_device(cpu); + + return thermal_throttle_add_dev(dev, cpu); } -static struct notifier_block thermal_throttle_cpu_notifier = +static int thermal_throttle_offline(unsigned int cpu) { - .notifier_call = thermal_throttle_cpu_callback, -}; + struct device *dev = get_cpu_device(cpu); + + thermal_throttle_remove_dev(dev); + return 0; +} static __init int thermal_throttle_init_device(void) { - unsigned int cpu = 0; - int err; + int ret; if (!atomic_read(&therm_throt_en)) return 0; - cpu_notifier_register_begin(); - - /* connect live CPUs to sysfs */ - for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); - WARN_ON(err); - } - - __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); - cpu_notifier_register_done(); - - return 0; + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", + thermal_throttle_online, + thermal_throttle_offline); + return ret < 0 ? ret : 0; } device_initcall(thermal_throttle_init_device); @@ -392,10 +358,9 @@ static void intel_thermal_interrupt(void) /* Check for violation of core thermal thresholds*/ notify_thresholds(msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, @@ -431,14 +396,16 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fcf9ae9384f4..bb0e75eed10a 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -6,7 +6,6 @@ #include <asm/irq_vectors.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> @@ -24,14 +23,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage __visible void smp_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 220b1a508513..ba12e8aa4a45 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -1,4 +1,4 @@ microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o -microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o microcode-$(CONFIG_MICROCODE_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 017bda12caae..7889ae492af0 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -5,6 +5,7 @@ * CPUs and later. * * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * 2013-2016 Borislav Petkov <bp@alien8.de> * * Author: Peter Oruba <peter.oruba@amd.com> * @@ -39,100 +40,149 @@ static struct equiv_cpu_entry *equiv_cpu_table; -struct ucode_patch { - struct list_head plist; - void *data; - u32 patch_id; - u16 equiv_cpu; -}; - -static LIST_HEAD(pcache); - /* * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. + * save from the initrd/builtin before jettisoning its contents. @mc is the + * microcode patch we found to match. */ -static u8 *container; -static size_t container_size; -static bool ucode_builtin; +struct cont_desc { + struct microcode_amd *mc; + u32 cpuid_1_eax; + u32 psize; + u8 *data; + size_t size; +}; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; -static struct cpio_data ucode_cpio; +/* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/x86/early-microcode.txt + */ +static const char +ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; -static struct cpio_data __init find_ucode_in_initrd(void) +static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig) { -#ifdef CONFIG_BLK_DEV_INITRD - char *path; - void *start; - size_t size; + for (; equiv_table && equiv_table->installed_cpu; equiv_table++) { + if (sig == equiv_table->installed_cpu) + return equiv_table->equiv_cpu; + } - /* - * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/early-microcode.txt - */ - static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + return 0; +} -#ifdef CONFIG_X86_32 - struct boot_params *p; +/* + * This scans the ucode blob for the proper container as we can have multiple + * containers glued together. Returns the equivalence ID from the equivalence + * table or 0 if none found. + * Returns the amount of bytes consumed while scanning. @desc contains all the + * data we're going to use in later stages of the application. + */ +static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc) +{ + struct equiv_cpu_entry *eq; + ssize_t orig_size = size; + u32 *hdr = (u32 *)ucode; + u16 eq_id; + u8 *buf; - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. - */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif /* !CONFIG_X86_32 */ + /* Am I looking at an equivalence table header? */ + if (hdr[0] != UCODE_MAGIC || + hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE || + hdr[2] == 0) + return CONTAINER_HDR_SZ; - return find_cpio_data(path, start, size, NULL); -#else - return (struct cpio_data){ NULL, 0, "" }; -#endif -} + buf = ucode; -static size_t compute_container_size(u8 *data, u32 total_size) -{ - size_t size = 0; - u32 *header = (u32 *)data; + eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ); - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; + /* Find the equivalence ID of our CPU in this table: */ + eq_id = find_equiv_id(eq, desc->cpuid_1_eax); - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; + buf += hdr[2] + CONTAINER_HDR_SZ; + size -= hdr[2] + CONTAINER_HDR_SZ; - while (total_size) { - u16 patch_size; + /* + * Scan through the rest of the container to find where it ends. We do + * some basic sanity-checking too. + */ + while (size > 0) { + struct microcode_amd *mc; + u32 patch_size; - header = (u32 *)data; + hdr = (u32 *)buf; - if (header[0] != UCODE_UCODE_TYPE) + if (hdr[0] != UCODE_UCODE_TYPE) break; - /* - * Sanity-check patch size. - */ - patch_size = header[1]; + /* Sanity-check patch size. */ + patch_size = hdr[1]; if (patch_size > PATCH_MAX_SIZE) break; - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; + /* Skip patch section header: */ + buf += SECTION_HDR_SIZE; + size -= SECTION_HDR_SIZE; + + mc = (struct microcode_amd *)buf; + if (eq_id == mc->hdr.processor_rev_id) { + desc->psize = patch_size; + desc->mc = mc; + } + + buf += patch_size; + size -= patch_size; } - return size; + /* + * If we have found a patch (desc->mc), it means we're looking at the + * container which has a patch for this CPU so return 0 to mean, @ucode + * already points to the proper container. Otherwise, we return the size + * we scanned so that we can advance to the next container in the + * buffer. + */ + if (desc->mc) { + desc->data = ucode; + desc->size = orig_size - size; + + return 0; + } + + return orig_size - size; +} + +/* + * Scan the ucode blob for the proper container as we can have multiple + * containers glued together. + */ +static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) +{ + ssize_t rem = size; + + while (rem >= 0) { + ssize_t s = parse_container(ucode, rem, desc); + if (!s) + return; + + ucode += s; + rem -= s; + } +} + +static int __apply_microcode_amd(struct microcode_amd *mc) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc->hdr.patch_id) + return -1; + + return 0; } /* @@ -143,128 +193,50 @@ static size_t compute_container_size(u8 *data, u32 total_size) * and on 32-bit during save_microcode_in_initrd_amd() -- we can call * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. + * + * Returns true if container found (sets @desc), false otherwise. */ -static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) +static bool +apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) { - struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; + struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; - u16 eq_id = 0; - int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; + struct microcode_amd *mc; + u32 rev, dummy, *new_rev; + bool ret = false; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; patch = &amd_ucode_patch; #endif - data = ucode; - left = size; - header = (u32 *)data; - - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return; + desc.cpuid_1_eax = cpuid_1_eax; - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); + scan_containers(ucode, size, &desc); - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - - *cont = data; - - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; - - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); - - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = *cont_sz - offset; - break; - } + mc = desc.mc; + if (!mc) + return ret; - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. - */ - while (left > 0) { - header = (u32 *)data; - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev >= mc->hdr.patch_id) + return ret; - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; - } + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + ret = true; - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; + if (save_patch) + memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } - if (check_current_patch_level(&rev, true)) - return; - - while (left > 0) { - struct microcode_amd *mc; - - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; - - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); - - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { - - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; - - if (save_patch) - memcpy(patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } + return ret; } -static bool __init load_builtin_amd_microcode(struct cpio_data *cp, - unsigned int family) +static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) { #ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; @@ -279,207 +251,113 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, #endif } -void __init load_ucode_amd_bsp(unsigned int family) +void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) { + struct ucode_cpu_info *uci; struct cpio_data cp; - bool *builtin; - void **data; - size_t *size; - -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); - builtin = (bool *)__pa_nodebug(&ucode_builtin); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; - builtin = &ucode_builtin; -#endif + const char *path; + bool use_pa; - *builtin = load_builtin_amd_microcode(&cp, family); - if (!*builtin) - cp = find_ucode_in_initrd(); + if (IS_ENABLED(CONFIG_X86_32)) { + uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info); + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + uci = ucode_cpu_info; + path = ucode_path; + use_pa = false; + } - if (!(cp.data && cp.size)) - return; + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) + cp = find_microcode_in_initrd(path, use_pa); - *data = cp.data; - *size = cp.size; + /* Needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_1_eax; - apply_ucode_in_initrd(cp.data, cp.size, true); + *ret = cp; } -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(void) +void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { - struct microcode_amd *mc; - size_t *usize; - void **ucode; - - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } + struct cpio_data cp = { }; - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); - - if (!*ucode || !*usize) + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) return; - apply_ucode_in_initrd(*ucode, *usize, false); + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); } -static void __init collect_cpu_sig_on_bsp(void *arg) +void load_ucode_amd_ap(unsigned int cpuid_1_eax) { - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} - -static void __init get_bsp_sig(void) -{ - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); -} -#else -void load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u8 *cont = container; - u32 rev, eax; - u16 eq_id; - - /* Exit if called on the BSP. */ - if (!cpu) - return; - - if (!container) - return; - - /* - * 64-bit runs with paging enabled, thus early==false. - */ - if (check_current_patch_level(&rev, false)) - return; - - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; - - eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); + struct cpio_data cp; + u32 *new_rev, rev, dummy; - eq_id = find_equiv_id(eq, eax); - if (!eq_id) - return; + if (IS_ENABLED(CONFIG_X86_32)) { + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + } else { + mc = (struct microcode_amd *)amd_ucode_patch; + new_rev = &ucode_new_rev; + } - if (eq_id == this_equiv_id) { - mc = (struct microcode_amd *)amd_ucode_patch; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; + /* Check whether we have saved a new patch already: */ + if (*new_rev && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + return; } + } - } else { - if (!ucode_cpio.data) - return; + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) + return; - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); - } + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); } -#endif -int __init save_microcode_in_initrd_amd(void) +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); + +int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { - unsigned long cont; - int retval = 0; + struct cont_desc desc = { 0 }; enum ucode_state ret; - u8 *cont_va; - u32 eax; + struct cpio_data cp; - if (!container) + cp = find_microcode_in_initrd(ucode_path, false); + if (!(cp.data && cp.size)) return -EINVAL; -#ifdef CONFIG_X86_32 - get_bsp_sig(); - cont = (unsigned long)container; - cont_va = __va(container); -#else - /* - * We need the physical address of the container for both bitness since - * boot_params.hdr.ramdisk_image is a physical address. - */ - cont = __pa_nodebug(container); - cont_va = container; -#endif - - /* - * Take into account the fact that the ramdisk might get relocated and - * therefore we need to recompute the container's position in virtual - * memory space. - */ - if (relocated_ramdisk) - container = (u8 *)(__va(relocated_ramdisk) + - (cont - boot_params.hdr.ramdisk_image)); - else - container = cont_va; - - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + desc.cpuid_1_eax = cpuid_1_eax; - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), + desc.data, desc.size); if (ret != UCODE_OK) - retval = -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. - */ - container = NULL; - container_size = 0; + return -EINVAL; - return retval; + return 0; } void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev; + u32 rev, dummy; - /* - * early==false because this is a syscore ->resume path and by - * that time paging is long enabled. - */ - if (check_current_patch_level(&rev, false)) + mc = (struct microcode_amd *)amd_ucode_patch; + if (!mc) return; - mc = (struct microcode_amd *)amd_ucode_patch; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (mc && rev < mc->hdr.patch_id) { + if (rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; pr_info("reload patch_level=0x%08x\n", ucode_new_rev); @@ -513,7 +391,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) + list_for_each_entry(p, µcode_cache, plist) if (p->equiv_cpu == equiv_cpu) return p; return NULL; @@ -523,7 +401,7 @@ static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) { + list_for_each_entry(p, µcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { if (p->patch_id >= new_patch->patch_id) /* we already have the latest patch */ @@ -536,14 +414,14 @@ static void update_cache(struct ucode_patch *new_patch) } } /* no patch found, add it */ - list_add_tail(&new_patch->plist, &pcache); + list_add_tail(&new_patch->plist, µcode_cache); } static void free_cache(void) { struct ucode_patch *p, *tmp; - list_for_each_entry_safe(p, tmp, &pcache, plist) { + list_for_each_entry_safe(p, tmp, µcode_cache, plist) { __list_del(p->plist.prev, p->plist.next); kfree(p->data); kfree(p); @@ -616,74 +494,13 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } -/* - * Those patch levels cannot be updated to newer ones and thus should be final. - */ -static u32 final_levels[] = { - 0x01000098, - 0x0100009f, - 0x010000af, - 0, /* T-101 terminator */ -}; - -/* - * Check the current patch level on this CPU. - * - * @rev: Use it to return the patch level. It is set to 0 in the case of - * error. - * - * Returns: - * - true: if update should stop - * - false: otherwise - */ -bool check_current_patch_level(u32 *rev, bool early) -{ - u32 lvl, dummy, i; - bool ret = false; - u32 *levels; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); - - if (IS_ENABLED(CONFIG_X86_32) && early) - levels = (u32 *)__pa_nodebug(&final_levels); - else - levels = final_levels; - - for (i = 0; levels[i]; i++) { - if (lvl == levels[i]) { - lvl = 0; - ret = true; - break; - } - } - - if (rev) - *rev = lvl; - - return ret; -} - -int __apply_microcode_amd(struct microcode_amd *mc_amd) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) - return -1; - - return 0; -} - -int apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; - u32 rev; + u32 rev, dummy; BUG_ON(raw_smp_processor_id() != cpu); @@ -696,8 +513,7 @@ int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - if (check_current_patch_level(&rev, false)) - return -1; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { @@ -860,7 +676,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) { enum ucode_state ret; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5ce5155f0695..b4a4cd39b358 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * 2006 Shaohua Li <shaohua.li@intel.com> - * 2013-2015 Borislav Petkov <bp@alien8.de> + * 2013-2016 Borislav Petkov <bp@alien8.de> * * X86 CPU microcode early update for Linux: * @@ -39,11 +39,16 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/setup.h> -#define MICROCODE_VERSION "2.01" +#define DRIVER_VERSION "2.2" static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr; +static bool dis_ucode_ldr = true; + +bool initrd_gone; + +LIST_HEAD(microcode_cache); /* * Synchronization. @@ -61,15 +66,47 @@ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -/* - * Operations that are run on a target cpu: - */ - struct cpu_info_ctx { struct cpu_signature *cpu_sig; int err; }; +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +static bool amd_check_current_patch_level(void) +{ + u32 lvl, dummy, i; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32)) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) + return true; + } + return false; +} + static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; @@ -85,8 +122,24 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (cmdline_find_option_bool(cmdline, option)) - *res = true; + if (!have_cpuid_p()) + return *res; + + /* + * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not + * completely accurate as xen pv guests don't see that CPUID bit set but + * that's good enough as they don't land on the BSP path anyway. + */ + if (native_cpuid_ecx(1) & BIT(31)) + return *res; + + if (x86_cpuid_vendor() == X86_VENDOR_AMD) { + if (amd_check_current_patch_level()) + return *res; + } + + if (cmdline_find_option_bool(cmdline, option) <= 0) + *res = false; return *res; } @@ -112,26 +165,21 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { - int vendor; - unsigned int family; + unsigned int cpuid_1_eax; if (check_loader_disabled_bsp()) return; - if (!have_cpuid_p()) - return; + cpuid_1_eax = native_cpuid_eax(1); - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); - - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_bsp(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_bsp(cpuid_1_eax); break; default: break; @@ -149,25 +197,21 @@ static bool check_loader_disabled_ap(void) void load_ucode_ap(void) { - int vendor, family; + unsigned int cpuid_1_eax; if (check_loader_disabled_ap()) return; - if (!have_cpuid_p()) - return; + cpuid_1_eax = native_cpuid_eax(1); - vendor = x86_cpuid_vendor(); - family = x86_cpuid_family(); - - switch (vendor) { + switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (family >= 6) + if (x86_family(cpuid_1_eax) >= 6) load_ucode_intel_ap(); break; case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_ap(); + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_ap(cpuid_1_eax); break; default: break; @@ -177,21 +221,81 @@ void load_ucode_ap(void) static int __init save_microcode_in_initrd(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + int ret = -EINVAL; switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 >= 6) - return save_microcode_in_initrd_intel(); + ret = save_microcode_in_initrd_intel(); break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(); + return save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; } - return -EINVAL; + initrd_gone = true; + + return ret; +} + +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long start = 0; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *params; + + if (use_pa) + params = (struct boot_params *)__pa_nodebug(&boot_params); + else + params = &boot_params; + + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + if (size) + start = params->hdr.ramdisk_image; + +# else /* CONFIG_X86_64 */ + size = (unsigned long)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (unsigned long)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + /* + * Fixup the start address: after reserve_initrd() runs, initrd_start + * has the virtual address of the beginning of the initrd. It also + * possibly relocates the ramdisk. In either case, initrd_start contains + * the updated address so use that instead. + * + * initrd_gone is for the hotplug case where we've thrown out initrd + * already. + */ + if (!use_pa) { + if (initrd_gone) + return (struct cpio_data){ NULL, 0, "" }; + if (initrd_start) + start = initrd_start; + } + + return find_cpio_data(path, (void *)start, size, NULL); +#else /* !CONFIG_BLK_DEV_INITRD */ + return (struct cpio_data){ NULL, 0, "" }; +#endif } void reload_early_microcode(void) @@ -453,16 +557,17 @@ static struct attribute_group mc_attr_group = { static void microcode_fini_cpu(int cpu) { - microcode_ops->microcode_fini_cpu(cpu); + if (microcode_ops->microcode_fini_cpu) + microcode_ops->microcode_fini_cpu(cpu); } static enum ucode_state microcode_resume_cpu(int cpu) { - pr_debug("CPU%d updated upon resume\n", cpu); - if (apply_microcode_on_target(cpu)) return UCODE_ERROR; + pr_debug("CPU%d updated upon resume\n", cpu); + return UCODE_OK; } @@ -496,6 +601,9 @@ static enum ucode_state microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + /* Refresh CPU microcode revision after resume. */ + collect_cpu_info(cpu); + if (uci->valid) return microcode_resume_cpu(cpu); @@ -579,12 +687,7 @@ static int mc_cpu_down_prep(unsigned int cpu) /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); - /* - * When a CPU goes offline, don't free up or invalidate the copy of - * the microcode in kernel memory, so that we can reuse it when the - * CPU comes back online without unnecessarily requesting the userspace - * for it again. - */ + return 0; } @@ -649,8 +752,7 @@ int __init microcode_init(void) cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); - pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); + pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); return 0; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cdc0deab00c9..8325d8a09ab0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,125 +39,83 @@ #include <asm/setup.h> #include <asm/msr.h> -/* - * Temporary microcode blobs pointers storage. We note here during early load - * the pointers to microcode blobs we've got from whatever storage (detached - * initrd, builtin). Later on, we put those into final storage - * mc_saved_data.mc_saved. - * - * Important: those are offsets from the beginning of initrd or absolute - * addresses within the kernel image when built-in. - */ -static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; - -static struct mc_saved_data { - unsigned int num_saved; - struct microcode_intel **mc_saved; -} mc_saved_data; +static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; -/* Microcode blobs within the initrd. 0 if builtin. */ -static struct ucode_blobs { - unsigned long start; - bool valid; -} blobs; +/* Current microcode patch used in early patching on the APs. */ +struct microcode_intel *intel_ucode_patch; -/* Go through saved patches and find the one suitable for the current CPU. */ -static enum ucode_state -find_microcode_patch(struct microcode_intel **saved, - unsigned int num_saved, struct ucode_cpu_info *uci) +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) { - struct microcode_intel *ucode_ptr, *new_mc = NULL; - struct microcode_header_intel *mc_hdr; - int new_rev, ret, i; - - new_rev = uci->cpu_sig.rev; - - for (i = 0; i < num_saved; i++) { - ucode_ptr = saved[i]; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - ret = has_newer_microcode(ucode_ptr, - uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev); - if (!ret) - continue; - - new_rev = mc_hdr->rev; - new_mc = ucode_ptr; - } + if (s1 != s2) + return false; - if (!new_mc) - return UCODE_NFOUND; + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; - uci->mc = (struct microcode_intel *)new_mc; - return UCODE_OK; + /* ... or they intersect. */ + return p1 & p2; } -static inline void -copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, - unsigned long off, int num_saved) +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int find_matching_signature(void *mc, unsigned int csig, int cpf) { + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; int i; - for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) -{ - int i; - struct microcode_intel ***mc_saved; + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; - mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; - for (i = 0; i < mcs->num_saved; i++) { - struct microcode_intel *p; + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + for (i = 0; i < ext_hdr->count; i++) { + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; } + return 0; } -#endif -static enum ucode_state -load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long offset, struct ucode_cpu_info *uci) +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mcs->num_saved; + struct microcode_header_intel *mc_hdr = mc; - if (!mcs->mc_saved) { - copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); + if (mc_hdr->rev <= new_rev) + return 0; - return find_microcode_patch(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mcs); - return find_microcode_patch(mc_saved_tmp, count, uci); -#else - return find_microcode_patch(mcs->mc_saved, count, uci); -#endif - } + return find_matching_signature(mc, csig, cpf); } /* * Given CPU signature and a microcode patch, this function finds if the * microcode patch has matching family and model with the CPU. + * + * %true - if there's a match + * %false - otherwise */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) +static bool microcode_matches(struct microcode_header_intel *mc_header, + unsigned long sig) { - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; + struct extended_sigtable *ext_header; + unsigned int fam_ucode, model_ucode; struct extended_signature *ext_sig; + unsigned int fam, model; + int ext_sigcount, i; fam = x86_family(sig); model = x86_model(sig); @@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(mc_header->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; /* Look for ext. headers: */ if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; + return false; ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -181,192 +139,242 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(ext_sig->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; ext_sig++; } - return UCODE_NFOUND; + return false; } -static int -save_microcode(struct mc_saved_data *mcs, - struct microcode_intel **mc_saved_src, - unsigned int num_saved) +static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) { - int i, j; - struct microcode_intel **saved_ptr; - int ret; + struct ucode_patch *p; - if (!num_saved) - return -EINVAL; + p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); - /* - * Copy new microcode data. - */ - saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); - if (!saved_ptr) - return -ENOMEM; - - for (i = 0; i < num_saved; i++) { - struct microcode_header_intel *mc_hdr; - struct microcode_intel *mc; - unsigned long size; - - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } + p->data = kmemdup(data, size, GFP_KERNEL); + if (!p->data) { + kfree(p); + return ERR_PTR(-ENOMEM); + } + + return p; +} + +static void save_microcode_patch(void *data, unsigned int size) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + struct ucode_patch *iter, *tmp, *p; + bool prev_found = false; + unsigned int sig, pf; + + mc_hdr = (struct microcode_header_intel *)data; + + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { + mc_saved_hdr = (struct microcode_header_intel *)iter->data; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (find_matching_signature(data, sig, pf)) { + prev_found = true; - mc = mc_saved_src[i]; - mc_hdr = &mc->hdr; - size = get_totalsize(mc_hdr); + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; - saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); - if (!saved_ptr[i]) { - ret = -ENOMEM; - goto err; + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer %p\n", data); + else + list_replace(&iter->plist, &p->plist); } } /* - * Point to newly saved microcode. + * There weren't any previous patches found in the list cache; save the + * newly found. */ - mcs->mc_saved = saved_ptr; - mcs->num_saved = num_saved; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(saved_ptr[j]); - kfree(saved_ptr); - - return ret; + if (!prev_found) { + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer for %p\n", data); + else + list_add_tail(&p->plist, µcode_cache); + } } -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - * - * Returns: The updated number @num_saved of saved microcode patches. - */ -static unsigned int _save_mc(struct microcode_intel **mc_saved, - u8 *ucode_ptr, unsigned int num_saved) +static int microcode_sanity_check(void *mc, int print_err) { - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf; - int found = 0, i; + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); - for (i = 0; i < num_saved; i++) { - mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } - if (!find_matching_signature(ucode_ptr, sig, pf)) - continue; + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format.\n"); + return -EINVAL; + } - found = 1; + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; - if (mc_hdr->rev <= mc_saved_hdr->rev) - continue; + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; /* - * Found an older ucode saved earlier. Replace it with - * this newer one. + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. */ - mc_saved[i] = (struct microcode_intel *)ucode_ptr; - break; + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; } - /* Newly detected microcode, save it to memory. */ - if (i >= num_saved && !found) - mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; + if (!ext_table_size) + return 0; - return num_saved; + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; } /* * Get microcode matching with BSP's model. Only CPUs with the same model as * BSP can stay in the platform. */ -static enum ucode_state __init -get_matching_model_microcode(unsigned long start, void *data, size_t size, - struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci) +static struct microcode_intel * +scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; struct microcode_header_intel *mc_header; - unsigned int num_saved = mcs->num_saved; - enum ucode_state state = UCODE_OK; - unsigned int leftover = size; - u8 *ucode_ptr = data; + struct microcode_intel *patch = NULL; unsigned int mc_size; - int i; - - while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { - if (leftover < sizeof(mc_header)) + while (size) { + if (size < sizeof(struct microcode_header_intel)) break; - mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_header = (struct microcode_header_intel *)data; mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) + if (!mc_size || + mc_size > size || + microcode_sanity_check(data, 0) < 0) break; - leftover -= mc_size; + size -= mc_size; - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. - */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { - ucode_ptr += mc_size; + if (!microcode_matches(mc_header, uci->cpu_sig.sig)) { + data += mc_size; continue; } - num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); + if (save) { + save_microcode_patch(data, mc_size); + goto next; + } - ucode_ptr += mc_size; - } - if (leftover) { - state = UCODE_ERROR; - return state; - } + if (!patch) { + if (!has_newer_microcode(data, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + uci->cpu_sig.rev)) + goto next; - if (!num_saved) { - state = UCODE_NFOUND; - return state; - } + } else { + struct microcode_header_intel *phdr = &patch->hdr; + + if (!has_newer_microcode(data, + phdr->sig, + phdr->pf, + phdr->rev)) + goto next; + } - for (i = 0; i < num_saved; i++) - mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; + /* We have a newer patch, save it. */ + patch = data; - mcs->num_saved = num_saved; +next: + data += mc_size; + } + + if (size) + return NULL; - return state; + return patch; } static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; unsigned int family, model; - struct cpu_signature csig; + struct cpu_signature csig = { 0 }; unsigned int eax, ebx, ecx, edx; - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - memset(uci, 0, sizeof(*uci)); eax = 0x00000001; @@ -374,23 +382,16 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_cpuid(&eax, &ebx, &ecx, &edx); csig.sig = eax; - family = x86_family(csig.sig); - model = x86_model(csig.sig); + family = x86_family(eax); + model = x86_model(eax); if ((model >= 5) || (family > 6)) { /* get processor flags from MSR 0x17 */ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - csig.rev = val[1]; + csig.rev = intel_get_microcode_revision(); uci->cpu_sig = csig; uci->valid = 1; @@ -401,40 +402,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) static void show_saved_mc(void) { #ifdef DEBUG - int i, j; + int i = 0, j; unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; + struct ucode_patch *p; - if (!mc_saved_data.num_saved) { + if (list_empty(µcode_cache)) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.num_saved; i++) { + list_for_each_entry(p, µcode_cache, plist) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; - int ext_sigcount; struct extended_signature *ext_sig; + int ext_sigcount; - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; + mc_saved_header = (struct microcode_header_intel *)p->data; + + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + date = mc_saved_header->date; + + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", - i, sig, pf, rev, total_size, + i++, sig, pf, rev, total_size, date & 0xffff, date >> 24, (date >> 16) & 0xff); @@ -443,7 +445,7 @@ static void show_saved_mc(void) if (total_size <= data_size + MC_HEADER_SIZE) continue; - ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; + ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -456,85 +458,43 @@ static void show_saved_mc(void) ext_sig++; } - } #endif } /* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. - * - * Please make sure this mc should be a valid microcode patch before calling - * this function. + * Save this microcode patch. It will be loaded early when a CPU is + * hot-added or resumes. */ -static void save_mc_for_early(u8 *mc) +static void save_mc_for_early(u8 *mc, unsigned int size) { #ifdef CONFIG_HOTPLUG_CPU /* Synchronization during CPU hotplug. */ static DEFINE_MUTEX(x86_cpu_microcode_mutex); - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int num_saved; - struct microcode_intel **mc_saved; - int ret, i; - mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.num_saved; - num_saved = mc_saved_data.num_saved; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && num_saved) - memcpy(mc_saved_tmp, mc_saved, - num_saved * sizeof(struct microcode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - num_saved = _save_mc(mc_saved_tmp, mc, num_saved); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - + save_microcode_patch(mc, size); show_saved_mc(); - /* - * Free old saved microcode data. - */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: mutex_unlock(&x86_cpu_microcode_mutex); #endif } -static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +static bool load_builtin_intel_microcode(struct cpio_data *cp) { -#ifdef CONFIG_X86_64 - unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + unsigned int eax = 1, ebx, ecx = 0, edx; char name[30]; + if (IS_ENABLED(CONFIG_X86_32)) + return false; + native_cpuid(&eax, &ebx, &ecx, &edx); sprintf(name, "intel-ucode/%02x-%02x-%02x", x86_family(eax), x86_model(eax), x86_stepping(eax)); return get_builtin_firmware(cp, name); -#else - return false; -#endif } /* @@ -570,8 +530,7 @@ void show_ucode_info_early(void) } /* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in + * At this point, we can not call printk() yet. Delay printing microcode info in * show_ucode_info_early() until printk() works. */ static void print_ucode(struct ucode_cpu_info *uci) @@ -616,7 +575,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci) static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc; - unsigned int val[2]; + u32 rev; mc = uci->mc; if (!mc) @@ -624,21 +583,16 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc->hdr.rev) + rev = intel_get_microcode_revision(); + if (rev != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 /* Flush global tlb. This is precaution. */ flush_tlb_early(); #endif - uci->cpu_sig.rev = val[1]; + uci->cpu_sig.rev = rev; if (early) print_ucode(uci); @@ -648,206 +602,133 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return 0; } -/* - * This function converts microcode patch offsets previously stored in - * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. - */ int __init save_microcode_in_initrd_intel(void) { - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data.num_saved; - unsigned long offset = 0; - int ret; + struct ucode_cpu_info uci; + struct cpio_data cp; + + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(ucode_path, false); - if (!count) + if (!(cp.data && cp.size)) return 0; - /* - * We have found a valid initrd but it might've been relocated in the - * meantime so get its updated address. - */ - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid) - offset = initrd_start; + collect_cpu_info_early(&uci); - copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count); + scan_microcode(cp.data, cp.size, &uci, true); - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - else - show_saved_mc(); + show_saved_mc(); - return ret; + return 0; } -static __init enum ucode_state -__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp) +/* + * @res_patch, output: a pointer to the patch we found. + */ +static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) { -#ifdef CONFIG_BLK_DEV_INITRD - static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; - char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name) - : ucode_name; -# ifdef CONFIG_X86_32 - unsigned long start = 0, size; - struct boot_params *params; - - params = (struct boot_params *)__pa_nodebug(&boot_params); - size = params->hdr.ramdisk_size; - - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ - start = (size ? params->hdr.ramdisk_image : 0); - -# else /* CONFIG_X86_64 */ - unsigned long start = 0, size; - - size = (u64)boot_params.ext_ramdisk_size << 32; - size |= boot_params.hdr.ramdisk_size; - - if (size) { - start = (u64)boot_params.ext_ramdisk_image << 32; - start |= boot_params.hdr.ramdisk_image; + static const char *path; + struct cpio_data cp; + bool use_pa; - start += PAGE_OFFSET; + if (IS_ENABLED(CONFIG_X86_32)) { + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + path = ucode_path; + use_pa = false; } -# endif - - *cd = find_cpio_data(p, (void *)start, size, NULL); - if (cd->data) { - blbp->start = start; - blbp->valid = true; - return UCODE_OK; - } else -#endif /* CONFIG_BLK_DEV_INITRD */ - return UCODE_ERROR; -} + /* try built-in microcode first */ + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(path, use_pa); -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci, struct ucode_blobs *blbp) -{ - struct cpio_data cd = { NULL, 0, "" }; - enum ucode_state ret; + if (!(cp.data && cp.size)) + return NULL; - /* try built-in microcode first */ - if (load_builtin_intel_microcode(&cd)) - /* - * Invalidate blobs as we might've gotten an initrd too, - * supplied by the boot loader, by mistake or simply forgotten - * there. That's fine, we ignore it since we've found builtin - * microcode already. - */ - blbp->valid = false; - else { - ret = __scan_microcode_initrd(&cd, blbp); - if (ret != UCODE_OK) - return ret; - } + collect_cpu_info_early(uci); - return get_matching_model_microcode(blbp->start, cd.data, cd.size, - mcs, mc_ptrs, uci); + return scan_microcode(cp.data, cp.size, uci, false); } -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_blobs *blbp) +void __init load_ucode_intel_bsp(void) { + struct microcode_intel *patch; struct ucode_cpu_info uci; - enum ucode_state ret; - - collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, mc_ptrs, &uci, blbp); - if (ret != UCODE_OK) + patch = __load_ucode_intel(&uci); + if (!patch) return; - ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci); - if (ret != UCODE_OK) - return; + uci.mc = patch; apply_microcode_early(&uci, true); } -void __init load_ucode_intel_bsp(void) +void load_ucode_intel_ap(void) { - struct ucode_blobs *blobs_p; - struct mc_saved_data *mcs; - unsigned long *ptrs; + struct microcode_intel *patch, **iup; + struct ucode_cpu_info uci; -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif + if (IS_ENABLED(CONFIG_X86_32)) + iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch); + else + iup = &intel_ucode_patch; + +reget: + if (!*iup) { + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + *iup = patch; + } + + uci.mc = *iup; - _load_ucode_intel_bsp(mcs, ptrs, blobs_p); + if (apply_microcode_early(&uci, true)) { + /* Mixed-silicon system? Try to refetch the proper patch: */ + *iup = NULL; + + goto reget; + } } -void load_ucode_intel_ap(void) +static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) { - struct ucode_blobs *blobs_p; - unsigned long *ptrs, start = 0; - struct mc_saved_data *mcs; - struct ucode_cpu_info uci; - enum ucode_state ret; - -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif + struct microcode_header_intel *phdr; + struct ucode_patch *iter, *tmp; - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. - */ - if (!mcs->num_saved) - return; + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { - if (blobs_p->valid) { - start = blobs_p->start; + phdr = (struct microcode_header_intel *)iter->data; - /* - * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles - * physmem mapping too and there we have the initrd. - */ - start += PAGE_OFFSET - __PAGE_OFFSET_BASE; - } + if (phdr->rev <= uci->cpu_sig.rev) + continue; - collect_cpu_info_early(&uci); - ret = load_microcode(mcs, ptrs, start, &uci); - if (ret != UCODE_OK) - return; + if (!find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) + continue; - apply_microcode_early(&uci, true); + return iter->data; + } + return NULL; } void reload_ucode_intel(void) { + struct microcode_intel *p; struct ucode_cpu_info uci; - enum ucode_state ret; - - if (!mc_saved_data.num_saved) - return; collect_cpu_info_early(&uci); - ret = find_microcode_patch(mc_saved_data.mc_saved, - mc_saved_data.num_saved, &uci); - if (ret != UCODE_OK) + p = find_patch(&uci); + if (!p) return; + uci.mc = p; + apply_microcode_early(&uci, false); } @@ -879,31 +760,13 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -/* - * return 0 - no update found - * return 1 - found update - */ -static int get_matching_mc(struct microcode_intel *mc, int cpu) -{ - struct cpu_signature cpu_sig; - unsigned int csig, cpf, crev; - - collect_cpu_info(cpu, &cpu_sig); - - csig = cpu_sig.sig; - cpf = cpu_sig.pf; - crev = cpu_sig.rev; - - return has_newer_microcode(mc, csig, cpf, crev); -} - static int apply_microcode_intel(int cpu) { struct microcode_intel *mc; struct ucode_cpu_info *uci; struct cpuinfo_x86 *c; - unsigned int val[2]; static int prev_rev; + u32 rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) @@ -911,46 +774,37 @@ static int apply_microcode_intel(int cpu) uci = ucode_cpu_info + cpu; mc = uci->mc; - if (!mc) - return 0; - - /* - * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc when it is newer than the one on this - * CPU. - */ - if (!get_matching_mc(mc, cpu)) - return 0; + if (!mc) { + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); + if (!mc) + return 0; + } /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + rev = intel_get_microcode_revision(); - if (val[1] != mc->hdr.rev) { + if (rev != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", cpu, mc->hdr.rev); return -1; } - if (val[1] != prev_rev) { + if (rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", - val[1], + rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, (mc->hdr.date >> 16) & 0xff); - prev_rev = val[1]; + prev_rev = rev; } c = &cpu_data(cpu); - uci->cpu_sig.rev = val[1]; - c->microcode = val[1]; + uci->cpu_sig.rev = rev; + c->microcode = rev; return 0; } @@ -962,8 +816,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int curr_mc_size = 0; + unsigned int curr_mc_size = 0, new_mc_size = 0; unsigned int csig, cpf; while (leftover) { @@ -1004,6 +857,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; + new_mc_size = mc_size; mc = NULL; /* trigger new vmalloc */ } @@ -1015,14 +869,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (leftover) { vfree(new_mc); - state = UCODE_ERROR; - goto out; + return UCODE_ERROR; } - if (!new_mc) { - state = UCODE_NFOUND; - goto out; - } + if (!new_mc) + return UCODE_NFOUND; vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; @@ -1032,12 +883,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. */ - save_mc_for_early(new_mc); + save_mc_for_early(new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); -out: - return state; + + return UCODE_OK; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -1081,20 +932,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size) return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - vfree(uci->mc); - uci->mc = NULL; -} - static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, - .microcode_fini_cpu = microcode_fini_cpu, }; struct microcode_ops * __init init_intel_microcode(void) @@ -1109,4 +951,3 @@ struct microcode_ops * __init init_intel_microcode(void) return µcode_intel_ops; } - diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c deleted file mode 100644 index 406cb6c0d9dd..000000000000 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ -#include <linux/firmware.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> - -#include <asm/microcode_intel.h> -#include <asm/processor.h> -#include <asm/msr.h> - -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - -int microcode_sanity_check(void *mc, int print_err) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format.\n"); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. - */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) -{ - struct microcode_header_intel *mc_hdr = mc; - - if (mc_hdr->rev <= new_rev) - return 0; - - return find_matching_signature(mc, csig, cpf); -} diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f44c5a50ab8..b5375b9497b3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -25,12 +25,12 @@ #include <asm/hyperv.h> #include <asm/mshyperv.h> #include <asm/desc.h> -#include <asm/idle.h> #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> #include <asm/timer.h> #include <asm/reboot.h> +#include <asm/nmi.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -133,33 +133,38 @@ static uint32_t __init ms_hyperv_platform(void) return 0; } -static cycle_t read_hv_clock(struct clocksource *arg) +static unsigned char hv_get_nmi_reason(void) { - cycle_t current_tick; - /* - * Read the partition counter to get the current tick count. This count - * is set to 0 when the partition is created and is incremented in - * 100 nanosecond units. - */ - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - return current_tick; + return 0; } -static struct clocksource hyperv_cs = { - .name = "hyperv_clocksource", - .rating = 400, /* use this when running on Hyperv*/ - .read = read_hv_clock, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static unsigned char hv_get_nmi_reason(void) +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes + * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * unknown NMI on the first CPU which gets it. + */ +static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) { - return 0; + static atomic_t nmi_cpu = ATOMIC_INIT(-1); + + if (!unknown_nmi_panic) + return NMI_DONE; + + if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + return NMI_HANDLED; + + return NMI_DONE; } +#endif static void __init ms_hyperv_init_platform(void) { + int hv_host_info_eax; + int hv_host_info_ebx; + int hv_host_info_ecx; + int hv_host_info_edx; + /* * Extract the features and hints */ @@ -170,6 +175,21 @@ static void __init ms_hyperv_init_platform(void) pr_info("HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + /* + * Extract host information. + */ + if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) { + hv_host_info_eax = cpuid_eax(HVCPUID_VERSION); + hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION); + hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION); + hv_host_info_edx = cpuid_edx(HVCPUID_VERSION); + + pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", + hv_host_info_eax, hv_host_info_ebx >> 16, + hv_host_info_ebx & 0xFFFF, hv_host_info_ecx, + hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); + } + #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { /* @@ -183,10 +203,10 @@ static void __init ms_hyperv_init_platform(void) pr_info("HyperV: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } -#endif - if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) - clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, + "hv_nmi_unknown"); +#endif #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; @@ -204,6 +224,13 @@ static void __init ms_hyperv_init_platform(void) */ if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Setup the hook to get control post apic initialization. + */ + x86_platform.apic_post_init = hyperv_init; +#endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 1db8dc490b66..d9794060fe22 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -17,11 +17,20 @@ struct cpuid_bit { u32 sub_leaf; }; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX +/* Please keep the leaf sorted by cpuid_bit.level for faster search. */ +static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { 0, 0, 0, 0, 0 } }; void init_scattered_cpuid_features(struct cpuinfo_x86 *c) @@ -30,18 +39,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, - { 0, 0, 0, 0, 0 } - }; - for (cb = cpuid_bits; cb->feature; cb++) { /* Verify that the level is valid */ @@ -50,10 +47,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], - ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, ®s[CPUID_EAX], + ®s[CPUID_EBX], ®s[CPUID_ECX], + ®s[CPUID_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); } } + +u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf, + enum cpuid_regs_idx reg) +{ + const struct cpuid_bit *cb; + u32 cpuid_val = 0; + + for (cb = cpuid_bits; cb->feature; cb++) { + + if (level > cb->level) + continue; + + if (level < cb->level) + break; + + if (reg == cb->reg && sub_leaf == cb->sub_leaf) { + if (cpu_has(&boot_cpu_data, cb->feature)) + cpuid_val |= BIT(cb->bit); + } + } + + return cpuid_val; +} +EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf); diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..8457b4978668 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,4 +1,6 @@ #include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/mm.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -14,6 +16,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } + + clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 5130985b758b..891f4dad7b2c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,11 +24,16 @@ #include <linux/dmi.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/clocksource.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> +#include <asm/timer.h> + +#undef pr_fmt +#define pr_fmt(fmt) "vmware: " fmt #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -48,6 +53,8 @@ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); +static unsigned long vmware_tsc_khz __ro_after_init; + static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; @@ -57,35 +64,80 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz, lpj; - uint32_t eax, ebx, ecx, edx; + return vmware_tsc_khz; +} - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); +#ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; +static int vmw_sched_clock __initdata = 1; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", - (unsigned long) tsc_hz / 1000, - (unsigned long) tsc_hz % 1000); - - if (!preset_lpj) { - lpj = ((u64)tsc_hz * 1000); - do_div(lpj, HZ); - preset_lpj = lpj; - } +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = 0; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static unsigned long long vmware_sched_clock(void) +{ + unsigned long long ns; - return tsc_hz; + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns -= vmware_cyc2ns.cyc2ns_offset; + return ns; } +static void __init vmware_sched_clock_setup(void) +{ + struct cyc2ns_data *d = &vmware_cyc2ns; + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift, + vmware_tsc_khz, NSEC_PER_MSEC, 0); + d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, + d->cyc2ns_shift); + + pv_time_ops.sched_clock = vmware_sched_clock; + pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); +} + +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware hypervisor"; + pv_cpu_ops.io_delay = paravirt_nop; + + if (vmware_tsc_khz && vmw_sched_clock) + vmware_sched_clock_setup(); +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; + uint64_t lpj, tsc_khz; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); if (ebx != UINT_MAX) { + lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_khz, 1000); + WARN_ON(tsc_khz >> 32); + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_khz / 1000, + (unsigned long) tsc_khz % 1000); + + if (!preset_lpj) { + do_div(lpj, HZ); + preset_lpj = lpj; + } + + vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; + #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ lapic_timer_frequency = ecx / HZ; @@ -96,6 +148,8 @@ static void __init vmware_platform_setup(void) pr_warn("Failed to get TSC freq from the hypervisor\n"); } + vmware_paravirt_ops_setup(); + #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2836de390f95..0931a105ffe1 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -45,10 +45,7 @@ #include <asm/msr.h> static struct class *cpuid_class; - -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; +static enum cpuhp_state cpuhp_cpuid_state; static void cpuid_smp_cpuid(void *cmd_block) { @@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static int cpuid_device_create(int cpu) +static int cpuid_device_create(unsigned int cpu) { struct device *dev; @@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void cpuid_device_destroy(int cpu) +static int cpuid_device_destroy(unsigned int cpu) { device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + return 0; } -static int cpuid_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = cpuid_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - cpuid_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct notifier_block cpuid_class_cpu_notifier = -{ - .notifier_call = cpuid_class_cpu_callback, -}; - static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); @@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) static int __init cpuid_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } cpuid_class = class_create(THIS_MODULE, "cpuid"); if (IS_ERR(cpuid_class)) { @@ -177,45 +149,28 @@ static int __init cpuid_init(void) } cpuid_class->devnode = cpuid_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = cpuid_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", + cpuid_device_create, cpuid_device_destroy); + if (err < 0) + goto out_class; - err = 0; - goto out; + cpuhp_cpuid_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) { - cpuid_device_destroy(i); - } - cpu_notifier_register_done(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); -out: return err; } +module_init(cpuid_init); static void __exit cpuid_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - cpuid_device_destroy(cpu); + cpuhp_remove_state(cpuhp_cpuid_state); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); - __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(cpuid_init); module_exit(cpuid_exit); MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 650830e39e3a..3741461c63a0 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg) int crash_load_segments(struct kimage *image) { - unsigned long src_start, src_sz, elf_sz; - void *elf_addr; int ret; + struct kexec_buf kbuf = { .image = image, .buf_min = 0, + .buf_max = ULONG_MAX, .top_down = false }; /* * Determine and load a segment for backup area. First 640K RAM @@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image) if (ret < 0) return ret; - src_start = image->arch.backup_src_start; - src_sz = image->arch.backup_src_sz; - /* Add backup segment. */ - if (src_sz) { + if (image->arch.backup_src_sz) { + kbuf.buffer = &crash_zero_bytes; + kbuf.bufsz = sizeof(crash_zero_bytes); + kbuf.memsz = image->arch.backup_src_sz; + kbuf.buf_align = PAGE_SIZE; /* * Ideally there is no source for backup segment. This is * copied in purgatory after crash. Just add a zero filled * segment for now to make sure checksum logic works fine. */ - ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, - sizeof(crash_zero_bytes), src_sz, - PAGE_SIZE, 0, -1, 0, - &image->arch.backup_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) return ret; + image->arch.backup_load_addr = kbuf.mem; pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, src_start, src_sz); + image->arch.backup_load_addr, + image->arch.backup_src_start, kbuf.memsz); } /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) return ret; - image->arch.elf_headers = elf_addr; - image->arch.elf_headers_sz = elf_sz; + image->arch.elf_headers = kbuf.buffer; + image->arch.elf_headers_sz = kbuf.bufsz; - ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, - ELF_CORE_HEADER_ALIGN, 0, -1, 0, - &image->arch.elf_load_addr); + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + ret = kexec_add_buffer(&kbuf); if (ret) { vfree((void *)image->arch.elf_headers); return ret; } + image->arch.elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, elf_sz, elf_sz); + image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); return ret; } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 11891ca7b716..538fedea9b3f 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,7 +10,7 @@ #include <linux/highmem.h> #include <linux/crash_dump.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static void *kdump_buf_page; diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index f6dfd9334b67..f9c324e08d85 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -1,9 +1,10 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/init_task.h> #include <linux/fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 85f854b98a9d..09d4ac0d2661 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -10,6 +10,8 @@ #include <linux/kdebug.h> #include <linux/module.h> #include <linux/ptrace.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> #include <linux/ftrace.h> #include <linux/kexec.h> #include <linux/bug.h> @@ -22,7 +24,6 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; unsigned int code_bytes = 64; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -46,14 +47,7 @@ static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { touch_nmi_watchdog(); - printk("%s [<%p>] %s%pB\n", - log_lvl, (void *)address, reliable ? "" : "? ", - (void *)address); -} - -void printk_address(unsigned long address) -{ - pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, @@ -67,6 +61,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); + stack = stack ? : get_stack_pointer(task, regs); /* * Iterate through the stacks, starting with the current stack pointer. @@ -82,8 +77,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (; stack; stack = stack_info.next_sp) { - const char *str_begin, *str_end; + for (regs = NULL; stack; stack = stack_info.next_sp) { + const char *stack_name; /* * If we overflowed the task stack into a guard page, jump back @@ -95,9 +90,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (get_stack_info(stack, task, &stack_info, &visit_mask)) break; - stack_type_str(stack_info.type, &str_begin, &str_end); - if (str_begin) - printk("%s <%s> ", log_lvl, str_begin); + stack_name = stack_type_name(stack_info.type); + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); /* * Scan the stack, printing any text addresses we find. At the @@ -119,6 +114,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!__kernel_text_address(addr)) continue; + /* + * Don't print regs->ip again if it was already printed + * by __show_regs() below. + */ + if (regs && stack == ®s->ip) { + unwind_next_frame(&state); + continue; + } + if (stack == ret_addr_p) reliable = 1; @@ -146,10 +150,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * of the addresses will just be printed as unreliable. */ unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state); + if (regs) + __show_regs(regs, 0); } - if (str_end) - printk("%s <%s> ", log_lvl, str_end); + if (stack_name) + printk("%s </%s>\n", log_lvl, stack_name); } } @@ -164,12 +173,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, ""); + show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, ""); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -261,14 +270,11 @@ int __die(const char *str, struct pt_regs *regs, long err) sp = kernel_stack_pointer(regs); savesegment(ss, ss); } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", + (void *)regs->ip, ss, sp); #else /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip); - printk(" RSP <%016lx>\n", regs->sp); + printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); #endif return 0; } @@ -291,22 +297,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init kstack_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - kstack_depth_to_print = val; - return 0; -} -early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { ssize_t ret; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 06eb322b5f9f..b0b3a3df7c20 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -2,6 +2,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ +#include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -16,18 +17,15 @@ #include <asm/stacktrace.h> -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { - switch (type) { - case STACK_TYPE_IRQ: - case STACK_TYPE_SOFTIRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + return NULL; } static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) @@ -109,8 +107,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -121,36 +121,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %08lx", log_lvl, *stack++); - } else - pr_cont(" %08lx", *stack++); - touch_nmi_watchdog(); - } - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - - void show_regs(struct pt_regs *regs) { int i; @@ -168,8 +138,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_EMERG); + show_trace_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 36cf1a498227..a8b117e93b46 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -2,6 +2,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ +#include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -28,23 +29,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - switch (type) { - case STACK_TYPE_IRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: - *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; - *end = "EOE"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + + return NULL; } static bool in_exception_stack(unsigned long *stack, struct stack_info *info) @@ -128,8 +123,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -140,56 +137,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *irq_stack_end; - unsigned long *irq_stack; - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); - irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - unsigned long word; - - if (stack >= irq_stack && stack <= irq_stack_end) { - if (stack == irq_stack_end) { - stack = (unsigned long *) (irq_stack_end[-1]); - pr_cont(" <EOI> "); - } - } else { - if (kstack_end(stack)) - break; - } - - if (probe_kernel_address(stack, word)) - break; - - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %016lx", log_lvl, word); - } else - pr_cont(" %016lx", word); - - stack++; - touch_nmi_watchdog(); - } - - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - void show_regs(struct pt_regs *regs) { int i; @@ -207,8 +154,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 90e8dde3ec26..b2bbad6ebe4d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -580,24 +580,19 @@ static void __init update_e820_saved(void) } #define MAX_GAP_END 0x100000000ull /* - * Search for a gap in the e820 memory space from start_addr to end_addr. + * Search for a gap in the e820 memory space from 0 to MAX_GAP_END. */ -__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, - unsigned long start_addr, unsigned long long end_addr) +static int __init e820_search_gap(unsigned long *gapstart, + unsigned long *gapsize) { - unsigned long long last; + unsigned long long last = MAX_GAP_END; int i = e820->nr_map; int found = 0; - last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; - while (--i >= 0) { unsigned long long start = e820->map[i].addr; unsigned long long end = start + e820->map[i].size; - if (end < start_addr) - continue; - /* * Since "last" is at most 4GB, we know we'll * fit in 32 bits if this condition is true @@ -628,18 +623,19 @@ __init void e820_setup_gap(void) unsigned long gapstart, gapsize; int found; - gapstart = 0x10000000; gapsize = 0x400000; - found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); + found = e820_search_gap(&gapstart, &gapsize); -#ifdef CONFIG_X86_64 if (!found) { +#ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; printk(KERN_ERR "e820: cannot find a gap in the 32bit address range\n" "e820: PCI devices with unassigned 32bit BARs may break!\n"); - } +#else + gapstart = 0x10000000; #endif + } /* * e820_reserve_resources_late protect stolen RAM already diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index aad34aafc0e0..d913047f832c 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -23,17 +23,12 @@ static double __initdata y = 3145727.0; */ void __init fpu__init_check_bugs(void) { - u32 cr0_saved; s32 fdiv_bug; /* kernel_fpu_begin/end() relies on patched alternative instructions. */ if (!boot_cpu_has(X86_FEATURE_FPU)) return; - /* We might have CR0::TS set already, clear it: */ - cr0_saved = read_cr0(); - write_cr0(cr0_saved & ~X86_CR0_TS); - kernel_fpu_begin(); /* @@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void) kernel_fpu_end(); - write_cr0(cr0_saved); - if (fdiv_bug) { set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); pr_warn("Hmm, FPU with FDIV bug\n"); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ebb4e95fbd74..e1114f070c2d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void) return this_cpu_read(in_kernel_fpu); } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. - */ static bool interrupted_kernel_fpu_idle(void) { - if (kernel_fpu_disabled()) - return false; - - if (use_eager_fpu()) - return true; - - return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); + return !kernel_fpu_disabled(); } /* @@ -125,8 +107,7 @@ void __kernel_fpu_begin(void) */ copy_fpregs_to_fpstate(fpu); } else { - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - __fpregs_activate_hw(); + __cpu_invalidate_fpregs_state(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -137,8 +118,6 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); - else - __fpregs_deactivate_hw(); kernel_fpu_enable(); } @@ -159,35 +138,6 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * CR0::TS save/restore functions: - */ -int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. - * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} -EXPORT_SYMBOL_GPL(irq_ts_save); - -void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} -EXPORT_SYMBOL_GPL(irq_ts_restore); - -/* * Save the FPU state (mark it for reload if necessary): * * This only ever gets called for the current task. @@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { - if (use_eager_fpu()) - copy_kernel_to_fpregs(&fpu->state); - else - fpregs_deactivate(fpu); + copy_kernel_to_fpregs(&fpu->state); } } trace_x86_fpu_after_save(fpu); @@ -231,13 +178,8 @@ void fpstate_init(union fpregs_state *state) memset(state, 0, fpu_kernel_xstate_size); - /* - * XRSTORS requires that this bit is set in xcomp_bv, or - * it will #GP. Make sure it is replaced after the memset(). - */ if (static_cpu_has(X86_FEATURE_XSAVES)) - state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; - + fpstate_init_xstate(&state->xsave); if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else @@ -247,7 +189,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -260,8 +201,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: */ - if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -283,10 +223,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - if (use_eager_fpu()) - copy_kernel_to_fpregs(&src_fpu->state); - else - fpregs_deactivate(src_fpu); + copy_kernel_to_fpregs(&src_fpu->state); } preempt_enable(); @@ -366,7 +303,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) if (fpu->fpstate_active) { /* Invalidate any lazy state: */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); @@ -409,7 +346,7 @@ void fpu__current_fpstate_write_begin(void) * ensures we will not be lazy and skip a XRSTOR in the * future. */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } /* @@ -459,7 +396,6 @@ void fpu__restore(struct fpu *fpu) trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); - fpu->counter++; trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } @@ -477,7 +413,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2f2b8c7ccb85..c2f8dde3255c 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -7,21 +7,10 @@ #include <asm/cmdline.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/init.h> /* - * Initialize the TS bit in CR0 according to the style of context-switches - * we are using: - */ -static void fpu__init_cpu_ctx_switch(void) -{ - if (!boot_cpu_has(X86_FEATURE_EAGER_FPU)) - stts(); - else - clts(); -} - -/* * Initialize the registers found in all CPUs, CR0 and CR4: */ static void fpu__init_cpu_generic(void) @@ -58,16 +47,9 @@ void fpu__init_cpu(void) { fpu__init_cpu_generic(); fpu__init_cpu_xstate(); - fpu__init_cpu_ctx_switch(); } -/* - * The earliest FPU detection code. - * - * Set the X86_FEATURE_FPU CPU-capability bit based on - * trying to execute an actual sequence of FPU instructions: - */ -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static bool fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -78,18 +60,25 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw)); + + pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID) && + !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + if (fpu__probe_without_cpuid()) + setup_force_cpu_cap(X86_FEATURE_FPU); else - clear_cpu_cap(c, X86_FEATURE_FPU); + setup_clear_cpu_cap(X86_FEATURE_FPU); } #ifndef CONFIG_MATH_EMULATION - if (!boot_cpu_has(X86_FEATURE_FPU)) { + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); @@ -233,82 +222,16 @@ static void __init fpu__init_system_xstate_size_legacy(void) } /* - * FPU context switching strategies: - * - * Against popular belief, we don't do lazy FPU saves, due to the - * task migration complications it brings on SMP - we only do - * lazy FPU restores. - * - * 'lazy' is the traditional strategy, which is based on setting - * CR0::TS to 1 during context-switch (instead of doing a full - * restore of the FPU state), which causes the first FPU instruction - * after the context switch (whenever it is executed) to fault - at - * which point we lazily restore the FPU state into FPU registers. - * - * Tasks are of course under no obligation to execute FPU instructions, - * so it can easily happen that another context-switch occurs without - * a single FPU instruction being executed. If we eventually switch - * back to the original task (that still owns the FPU) then we have - * not only saved the restores along the way, but we also have the - * FPU ready to be used for the original task. - * - * 'lazy' is deprecated because it's almost never a performance win - * and it's much more complicated than 'eager'. - * - * 'eager' switching is by default on all CPUs, there we switch the FPU - * state during every context switch, regardless of whether the task - * has used FPU instructions in that time slice or not. This is done - * because modern FPU context saving instructions are able to optimize - * state saving and restoration in hardware: they can detect both - * unused and untouched FPU state and optimize accordingly. - * - * [ Note that even in 'lazy' mode we might optimize context switches - * to use 'eager' restores, if we detect that a task is using the FPU - * frequently. See the fpu->counter logic in fpu/internal.h for that. ] - */ -static enum { ENABLE, DISABLE } eagerfpu = ENABLE; - -/* * Find supported xfeatures based on cpu features and command-line input. * This must be called after fpu__init_parse_early_param() is called and * xfeatures_mask is enumerated. */ u64 __init fpu__get_supported_xfeatures_mask(void) { - /* Support all xfeatures known to us */ - if (eagerfpu != DISABLE) - return XCNTXT_MASK; - - /* Warning of xfeatures being disabled for no eagerfpu mode */ - if (xfeatures_mask & XFEATURE_MASK_EAGER) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XFEATURE_MASK_EAGER); - } - - /* Return a mask that masks out all features requiring eagerfpu mode */ - return ~XFEATURE_MASK_EAGER; + return XCNTXT_MASK; } -/* - * Disable features dependent on eagerfpu. - */ -static void __init fpu__clear_eager_fpu_features(void) -{ - setup_clear_cpu_cap(X86_FEATURE_MPX); -} - -/* - * Pick the FPU context switching strategy: - * - * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of - * the following is true: - * - * (1) the cpu has xsaveopt, as it has the optimization and doing eager - * FPU switching has a relatively low cost compared to a plain xsave; - * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU - * switching. Should the kernel boot with noxsaveopt, we support MPX - * with eager FPU switching at a higher cost. - */ +/* Legacy code to initialize eager fpu mode. */ static void __init fpu__init_system_ctx_switch(void) { static bool on_boot_cpu __initdata = 1; @@ -317,17 +240,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - - if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (xfeatures_mask & XFEATURE_MASK_EAGER) - eagerfpu = ENABLE; - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); } /* @@ -336,11 +248,6 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { - eagerfpu = DISABLE; - fpu__clear_eager_fpu_features(); - } - if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -375,14 +282,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) */ fpu__init_cpu(); - /* - * But don't leave CR0::TS set yet, as some of the FPU setup - * methods depend on being able to execute FPU instructions - * that will fault on a set TS, such as the FXSAVE in - * fpu__init_system_mxcsr(). - */ - clts(); - fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c114b132d121..b188b16841e3 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,6 +5,7 @@ #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> +#include <linux/sched/task_stack.h> /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a184c210efba..83c23c230b4c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } fpu->fpstate_active = 1; - if (use_eager_fpu()) { - preempt_disable(); - fpu__restore(fpu); - preempt_enable(); - } + preempt_disable(); + fpu__restore(fpu); + preempt_enable(); return err; } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 095ef7ddd6ae..c24ac1efb12d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); @@ -73,9 +74,11 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); + setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); + setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* @@ -703,8 +706,14 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } @@ -890,15 +899,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; - /* - * For most XSAVE components, this would be an arduous task: - * brining fpstate up to date with fpregs, updating fpstate, - * then re-populating fpregs. But, for components that are - * never lazily managed, we can just access the fpregs - * directly. PKRU is never managed lazily, so we can just - * manipulate it directly. Make sure it stays that way. - */ - WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index f16c55bfc090..e5fb436a6548 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -49,3 +49,65 @@ asmlinkage __visible void __init i386_start_kernel(void) start_kernel(); } + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. Note the upper half of each PMD or PTE are + * always zero at this stage. + */ +void __init mk_early_pgtbl_32(void) +{ +#ifdef __pa +#undef __pa +#endif +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) + pte_t pte, *ptep; + int i; + unsigned long *ptr; + /* Enough space to fit pagetables for the low memory linear map */ + const unsigned long limit = __pa(_end) + + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT); +#ifdef CONFIG_X86_PAE + pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd); +#define SET_PL2(pl2, val) { (pl2).pmd = (val); } +#else + pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table); +#define SET_PL2(pl2, val) { (pl2).pgd = (val); } +#endif + + ptep = (pte_t *)__pa(__brk_base); + pte.pte = PTE_IDENT_ATTR; + + while ((pte.pte & PTE_PFN_MASK) < limit) { + + SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR); + *pl2p = pl2; +#ifndef CONFIG_X86_PAE + /* Kernel PDE entry */ + *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2; +#endif + for (i = 0; i < PTRS_PER_PTE; i++) { + *ptep = pte; + pte.pte += PAGE_SIZE; + ptep++; + } + + pl2p++; + } + + ptr = (unsigned long *)__pa(&max_pfn_mapped); + /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */ + *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + + ptr = (unsigned long *)__pa(&_brk_end); + *ptr = (unsigned long)ptep + PAGE_OFFSET; +} + diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 2dabea46f039..1f85ee8f9439 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -24,6 +24,7 @@ #include <asm/nops.h> #include <asm/bootparam.h> #include <asm/export.h> +#include <asm/pgtable_32.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -41,40 +42,8 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -/* - * This is how much memory in addition to the memory covered up to - * and including _end we need mapped initially. - * We need: - * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) - * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) - * - * Modulo rounding, each megabyte assigned here requires a kilobyte of - * memory, which is currently unreclaimed. - * - * This should be a multiple of a page. - * - * KERNEL_IMAGE_SIZE should be greater than pa(_end) - * and small than max_low_pfn, otherwise will waste some page table entries - */ -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif - -/* - * Number of possible pages in the lowmem region. - * - * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a - * gas warning about overflowing shift count when gas has been compiled - * with only a host target support using a 32-bit type for internal - * representation. - */ -LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT +#define SIZEOF_PTREGS 17*4 /* * Worst-case size of the kernel mapping we need to make: @@ -158,109 +127,34 @@ ENTRY(startup_32) call load_ucode_bsp #endif -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - */ -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) + /* Create early pagetables. */ + call mk_early_pgtbl_32 /* Do early initialization of the fixmap area */ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#ifdef CONFIG_X86_PAE +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#else movl %eax,pa(initial_page_table+0xffc) #endif #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) - jb default_entry + jb .Ldefault_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax - jae bad_subarch + jae .Lbad_subarch movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax -bad_subarch: +.Lbad_subarch: WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really @@ -270,14 +164,14 @@ WEAK(xen_entry) __INITDATA subarch_entries: - .long default_entry /* normal x86/PC */ + .long .Ldefault_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ + .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #else - jmp default_entry + jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_HOTPLUG_CPU @@ -289,7 +183,8 @@ num_subarch_entries = (. - subarch_entries) / 4 ENTRY(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp - jmp *(initial_code) + call *(initial_code) +1: jmp 1b ENDPROC(start_cpu0) #endif @@ -317,7 +212,7 @@ ENTRY(startup_32_smp) call load_ucode_ap #endif -default_entry: +.Ldefault_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ X86_CR0_PG) @@ -347,7 +242,7 @@ default_entry: pushfl popl %eax # get EFLAGS testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? - jz enable_paging # hw disallowed setting of ID bit + jz .Lenable_paging # hw disallowed setting of ID bit # which means no CPUID and no CR4 xorl %eax,%eax @@ -357,13 +252,13 @@ default_entry: movl $1,%eax cpuid andl $~1,%edx # Ignore CPUID.FPU - jz enable_paging # No flags or only CPUID.FPU = no CR4 + jz .Lenable_paging # No flags or only CPUID.FPU = no CR4 movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled - jz enable_paging + jz .Lenable_paging /* Check if extended functions are implemented */ movl $0x80000000, %eax @@ -371,7 +266,7 @@ default_entry: /* Value must be in the range 0x80000001 to 0x8000ffff */ subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax - ja enable_paging + ja .Lenable_paging /* Clear bogus XD_DISABLE bits */ call verify_cpu @@ -380,7 +275,7 @@ default_entry: cpuid /* Execute Disable bit supported? */ btl $(X86_FEATURE_NX & 31), %edx - jnc enable_paging + jnc .Lenable_paging /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx @@ -390,7 +285,7 @@ default_entry: /* Make changes effective */ wrmsr -enable_paging: +.Lenable_paging: /* * Enable paging @@ -419,7 +314,7 @@ enable_paging: */ movb $4,X86 # at least 486 cmpl $-1,X86_CPUID - je is486 + je .Lis486 /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID @@ -430,7 +325,7 @@ enable_paging: movl %ecx,X86_VENDOR_ID+8 # last 4 chars orl %eax,%eax # do we have processor info as well? - je is486 + je .Lis486 movl $1,%eax # Use the CPUID instruction to get CPU type cpuid @@ -444,7 +339,7 @@ enable_paging: movb %cl,X86_MASK movl %edx,X86_CAPABILITY -is486: +.Lis486: movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET @@ -470,8 +365,9 @@ is486: xorl %eax,%eax # Clear LDT lldt %ax - pushl $0 # fake return address for unwinder - jmp *(initial_code) + call *(initial_code) +1: jmp 1b +ENDPROC(startup_32_smp) #include "verify_cpu.S" @@ -662,6 +558,7 @@ ENTRY(setup_once_ref) __PAGE_ALIGNED_BSS .align PAGE_SIZE #ifdef CONFIG_X86_PAE +.globl initial_pg_pmd initial_pg_pmd: .fill 1024*KPMDS,4,0 #else @@ -709,7 +606,12 @@ ENTRY(initial_page_table) .data .balign 4 ENTRY(initial_stack) - .long init_thread_union+THREAD_SIZE + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ + TOP_OF_KERNEL_STACK_PADDING; __INITRODATA int_msg: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b4421cc191b0..b467b14b03eb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,13 +66,8 @@ startup_64: * tables and then reload them. */ - /* - * Setup stack for verify_cpu(). "-8" because initial_stack is defined - * this way, see below. Our best guess is a NULL ptr for stack - * termination heuristics and we don't want to break anything which - * might depend on it (kgdb, ...). - */ - leaq (__end_init_task - 8)(%rip), %rsp + /* Set up the stack for verify_cpu(), similar to initial_stack below */ + leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp /* Sanitize CPU configuration */ call verify_cpu @@ -117,20 +112,20 @@ startup_64: movq %rdi, %rax shrq $PGDIR_SHIFT, %rax - leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx movq %rdx, 0(%rbx,%rax,8) movq %rdx, 8(%rbx,%rax,8) - addq $4096, %rdx + addq $PAGE_SIZE, %rdx movq %rdi, %rax shrq $PUD_SHIFT, %rax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) incl %eax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) - addq $8192, %rbx + addq $PAGE_SIZE * 2, %rbx movq %rdi, %rax shrq $PMD_SHIFT, %rdi addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax @@ -147,6 +142,9 @@ startup_64: decl %ecx jnz 1b + test %rbp, %rbp + jz .Lskip_fixup + /* * Fixup the kernel text+data virtual addresses. Note that * we might write invalid pmds, when the kernel is relocated @@ -154,9 +152,9 @@ startup_64: * beyond _end. */ leaq level2_kernel_pgt(%rip), %rdi - leaq 4096(%rdi), %r8 + leaq PAGE_SIZE(%rdi), %r8 /* See if it is a valid page table entry */ -1: testb $1, 0(%rdi) +1: testb $_PAGE_PRESENT, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ @@ -167,6 +165,7 @@ startup_64: /* Fixup phys_base */ addq %rbp, phys_base(%rip) +.Lskip_fixup: movq $(early_level4_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) @@ -265,13 +264,17 @@ ENTRY(secondary_startup_64) movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax movl initial_gs+4(%rip),%edx - wrmsr + wrmsr /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - - /* Finally jump to run C code and to be on real kernel address + jmp start_cpu +ENDPROC(secondary_startup_64) + +ENTRY(start_cpu) + /* + * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this @@ -295,12 +298,14 @@ ENTRY(secondary_startup_64) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder + pushq $.Lafter_lret # put return address on stack for unwinder + xorq %rbp, %rbp # clear frame pointer + movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq -ENDPROC(secondary_startup_64) +.Lafter_lret: +ENDPROC(start_cpu) #include "verify_cpu.S" @@ -308,15 +313,11 @@ ENDPROC(secondary_startup_64) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary(). + * start_secondary() via start_cpu(). */ ENTRY(start_cpu0) - movq initial_stack(%rip),%rsp - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq + movq initial_stack(%rip), %rsp + jmp start_cpu ENDPROC(start_cpu0) #endif @@ -328,7 +329,11 @@ ENDPROC(start_cpu0) GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) GLOBAL(initial_stack) - .quad init_thread_union+THREAD_SIZE-8 + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA bad_address: diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 274fab99169d..dc6ba5bda9fc 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -352,6 +352,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); @@ -791,7 +792,7 @@ static union hpet_lock hpet __cacheline_aligned = { { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, }; -static cycle_t read_hpet(struct clocksource *cs) +static u64 read_hpet(struct clocksource *cs) { unsigned long flags; union hpet_lock old, new; @@ -802,7 +803,7 @@ static cycle_t read_hpet(struct clocksource *cs) * Read HPET directly if in NMI. */ if (in_nmi()) - return (cycle_t)hpet_readl(HPET_COUNTER); + return (u64)hpet_readl(HPET_COUNTER); /* * Read the current state of the lock and HPET value atomically. @@ -821,7 +822,7 @@ static cycle_t read_hpet(struct clocksource *cs) WRITE_ONCE(hpet.value, new.value); arch_spin_unlock(&hpet.lock); local_irq_restore(flags); - return (cycle_t)new.value; + return (u64)new.value; } local_irq_restore(flags); @@ -843,15 +844,15 @@ contended: new.lockval = READ_ONCE(hpet.lockval); } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); - return (cycle_t)new.value; + return (u64)new.value; } #else /* * For UP or 32-bit. */ -static cycle_t read_hpet(struct clocksource *cs) +static u64 read_hpet(struct clocksource *cs) { - return (cycle_t)hpet_readl(HPET_COUNTER); + return (u64)hpet_readl(HPET_COUNTER); } #endif @@ -867,7 +868,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; - cycle_t t1; + u64 t1; /* Start the counter */ hpet_restart_counter(); @@ -1051,11 +1052,11 @@ static __init int hpet_late_init(void) return 0; /* This notifier should be called after workqueue is ready */ - ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE", + ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online", hpet_cpuhp_online, NULL); if (ret) return ret; - ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL, + ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL, hpet_cpuhp_dead); if (ret) goto err_cpuhp; diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 589b3193f102..ca49bab3e467 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/errno.h> @@ -16,6 +17,7 @@ #include <linux/syscalls.h> #include <linux/bitmap.h> #include <asm/syscalls.h> +#include <asm/desc.h> /* * this changes the io permissions bitmap in the current task. @@ -45,6 +47,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; set_thread_flag(TIF_IO_BITMAP); + + preempt_disable(); + refresh_TR(); + preempt_enable(); } /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f669fdd2010..4d8183b5f113 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -14,7 +14,6 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/desc.h> @@ -265,7 +264,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -__visible void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -316,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) } #endif -__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 9ebd0b0e73d9..3be74fbdeff2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -15,8 +15,8 @@ #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/smp.h> +#include <linux/sched/task_stack.h> #include <asm/io_apic.h> -#include <asm/idle.h> #include <asm/apic.h> int sysctl_panic_on_stackoverflow; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 3512ba607361..275487872be2 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <asm/apic.h> #include <asm/trace/irq_vectors.h> +#include <linux/interrupt.h> static inline void __smp_irq_work_interrupt(void) { @@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -__visible void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c new file mode 100644 index 000000000000..f73f475d0573 --- /dev/null +++ b/arch/x86/kernel/itmt.c @@ -0,0 +1,213 @@ +/* + * itmt.c: Support Intel Turbo Boost Max Technology 3.0 + * + * (C) Copyright 2016 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT), + * the maximum turbo frequencies of some cores in a CPU package may be + * higher than for the other cores in the same package. In that case, + * better performance can be achieved by making the scheduler prefer + * to run tasks on the CPUs with higher max turbo frequencies. + * + * This file provides functions and data structures for enabling the + * scheduler to favor scheduling on cores can be boosted to a higher + * frequency under ITMT. + */ + +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/sysctl.h> +#include <linux/nodemask.h> + +static DEFINE_MUTEX(itmt_update_mutex); +DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority); + +/* Boolean to track if system has ITMT capabilities */ +static bool __read_mostly sched_itmt_capable; + +/* + * Boolean to control whether we want to move processes to cpu capable + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. + * + * It can be set via /proc/sys/kernel/sched_itmt_enabled + */ +unsigned int __read_mostly sysctl_sched_itmt_enabled; + +static int sched_itmt_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned int old_sysctl; + int ret; + + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return -EINVAL; + } + + old_sysctl = sysctl_sched_itmt_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return ret; +} + +static unsigned int zero; +static unsigned int one = 1; +static struct ctl_table itmt_kern_table[] = { + { + .procname = "sched_itmt_enabled", + .data = &sysctl_sched_itmt_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_itmt_update_handler, + .extra1 = &zero, + .extra2 = &one, + }, + {} +}; + +static struct ctl_table itmt_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = itmt_kern_table, + }, + {} +}; + +static struct ctl_table_header *itmt_sysctl_header; + +/** + * sched_set_itmt_support() - Indicate platform supports ITMT + * + * This function is used by the OS to indicate to scheduler that the platform + * is capable of supporting the ITMT feature. + * + * The current scheme has the pstate driver detects if the system + * is ITMT capable and call sched_set_itmt_support. + * + * This must be done only after sched_set_itmt_core_prio + * has been called to set the cpus' priorities. + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + * + * Return: 0 on success + */ +int sched_set_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return 0; + } + + itmt_sysctl_header = register_sysctl_table(itmt_root_table); + if (!itmt_sysctl_header) { + mutex_unlock(&itmt_update_mutex); + return -ENOMEM; + } + + sched_itmt_capable = true; + + sysctl_sched_itmt_enabled = 1; + + x86_topology_update = true; + rebuild_sched_domains(); + + mutex_unlock(&itmt_update_mutex); + + return 0; +} + +/** + * sched_clear_itmt_support() - Revoke platform's support of ITMT + * + * This function is used by the OS to indicate that it has + * revoked the platform's support of ITMT feature. + * + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + */ +void sched_clear_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return; + } + sched_itmt_capable = false; + + if (itmt_sysctl_header) { + unregister_sysctl_table(itmt_sysctl_header); + itmt_sysctl_header = NULL; + } + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ + sysctl_sched_itmt_enabled = 0; + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); +} + +int arch_asym_cpu_priority(int cpu) +{ + return per_cpu(sched_core_priority, cpu); +} + +/** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT + * @prio: Priority of cpu core + * @core_cpu: The cpu number associated with the core + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional + * to the max boost frequency. CPU with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +void sched_set_itmt_core_prio(int prio, int core_cpu) +{ + int cpu, i = 1; + + for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { + int smt_prio; + + /* + * Ensure that the siblings are moved to the end + * of the priority chain and only used when + * all other high priority cpus are out of capacity. + */ + smt_prio = prio * smp_num_siblings / i; + per_cpu(sched_core_priority, cpu) = smt_prio; + i++; + } +} diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index fc25f698d792..c37bd0f39c70 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -32,8 +32,7 @@ static void bug_at(unsigned char *ip, int line) * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ - pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", - ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); BUG(); } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 3407b148c240..d0a814a9d96a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel, struct setup_header *header; int setup_sects, kern16_size, ret = 0; - unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; unsigned long purgatory_load_addr; - unsigned long kernel_bufsz, kernel_memsz, kernel_align; - char *kernel_buf; struct bzimage64_data *ldata; struct kexec_entry64_regs regs64; void *stack; unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, + .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); - params_misc_sz = params_cmdline_sz + efi_map_sz + + kbuf.bufsz = params_cmdline_sz + efi_map_sz + sizeof(struct setup_data) + sizeof(struct efi_setup_data); - params = kzalloc(params_misc_sz, GFP_KERNEL); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel, /* Is there a limit on setup header size? */ memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); - ret = kexec_add_buffer(image, (char *)params, params_misc_sz, - params_misc_sz, 16, MIN_BOOTPARAM_ADDR, - ULONG_MAX, 1, &bootparam_load_addr); + kbuf.buffer = params; + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = 16; + kbuf.buf_min = MIN_BOOTPARAM_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + bootparam_load_addr = kbuf.mem; pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - bootparam_load_addr, params_misc_sz, params_misc_sz); + bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); /* Load kernel */ - kernel_buf = kernel + kern16_size; - kernel_bufsz = kernel_len - kern16_size; - kernel_memsz = PAGE_ALIGN(header->init_size); - kernel_align = header->kernel_alignment; - - ret = kexec_add_buffer(image, kernel_buf, - kernel_bufsz, kernel_memsz, kernel_align, - MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, - &kernel_load_addr); + kbuf.buffer = kernel + kern16_size; + kbuf.bufsz = kernel_len - kern16_size; + kbuf.memsz = PAGE_ALIGN(header->init_size); + kbuf.buf_align = header->kernel_alignment; + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + kernel_load_addr = kbuf.mem; pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_load_addr, kernel_memsz, kernel_memsz); + kernel_load_addr, kbuf.bufsz, kbuf.memsz); /* Load initrd high */ if (initrd) { - ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, - PAGE_SIZE, MIN_INITRD_LOAD_ADDR, - ULONG_MAX, 1, &initrd_load_addr); + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.buf_min = MIN_INITRD_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + initrd_load_addr = kbuf.mem; pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", initrd_load_addr, initrd_len, initrd_len); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d9d8d16b69db..6384eb754a58 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,6 +45,7 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> +#include <linux/sched/debug.h> #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> @@ -56,7 +57,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> @@ -745,7 +746,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) * will be the real return address, and all the rest will * point to kretprobe_trampoline. */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { + hlist_for_each_entry(ri, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3bb4c5f021f6..3d1bee9d6a72 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -33,7 +33,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index edbbfc854e39..14f65a5f938e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,7 +42,6 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> -#include <asm/idle.h> #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> @@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - exit_idle(); kvm_async_pf_task_wake((u32)read_cr2()); rcu_irq_exit(); break; @@ -308,7 +305,7 @@ static void kvm_register_steal_time(void) static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { /** * This relies on __test_and_clear_bit to modify the memory @@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; - apic_write(APIC_EOI, APIC_EOI_ACK); + apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); } static void kvm_guest_cpu_init(void) @@ -592,6 +589,38 @@ out: local_irq_restore(flags); } +#ifdef CONFIG_X86_32 +__visible bool __kvm_vcpu_is_preempted(long cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!src->preempted; +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + +#else + +#include <asm/asm-offsets.h> + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +"ret;" +".popsection"); + +#endif + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -608,20 +637,11 @@ void __init kvm_spinlock_init(void) pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; -} -static __init int kvm_spinlock_init_jump(void) -{ - if (!kvm_para_available()) - return 0; - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - printk(KERN_INFO "KVM setup paravirtual spinlock\n"); - - return 0; + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + pv_lock_ops.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); + } } -early_initcall(kvm_spinlock_init_jump); #endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 60b9949f1e65..d88967659098 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -25,14 +25,16 @@ #include <linux/hardirq.h> #include <linux/memblock.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <asm/x86_init.h> #include <asm/reboot.h> +#include <asm/kvmclock.h> static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static cycle_t kvm_sched_clock_offset; +static u64 kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -49,6 +51,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { return hv_clock; } +EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -79,10 +82,10 @@ static int kvm_set_wallclock(const struct timespec *now) return -1; } -static cycle_t kvm_clock_read(void) +static u64 kvm_clock_read(void) { struct pvclock_vcpu_time_info *src; - cycle_t ret; + u64 ret; int cpu; preempt_disable_notrace(); @@ -93,12 +96,12 @@ static cycle_t kvm_clock_read(void) return ret; } -static cycle_t kvm_clock_get_cycles(struct clocksource *cs) +static u64 kvm_clock_get_cycles(struct clocksource *cs) { return kvm_clock_read(); } -static cycle_t kvm_sched_clock_read(void) +static u64 kvm_sched_clock_read(void) { return kvm_clock_read() - kvm_sched_clock_offset; } @@ -107,12 +110,12 @@ static inline void kvm_sched_clock_init(bool stable) { if (!stable) { pv_time_ops.sched_clock = kvm_clock_read; + clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", kvm_sched_clock_offset); @@ -174,13 +177,14 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -static struct clocksource kvm_clock = { +struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +EXPORT_SYMBOL_GPL(kvm_clock); int kvm_register_clock(char *txt) { diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6707039b9032..d4a15831ac58 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm) } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ -static struct ldt_struct *alloc_ldt_struct(int size) +static struct ldt_struct *alloc_ldt_struct(unsigned int size) { struct ldt_struct *new_ldt; - int alloc_size; + unsigned int alloc_size; if (size > LDT_ENTRIES) return NULL; @@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) paravirt_free_ldt(ldt->entries, ldt->size); if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); + vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); @@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; + struct ldt_struct *new_ldt, *old_ldt; + unsigned int oldsize, newsize; + struct user_desc ldt_info; struct desc_struct ldt; int error; - struct user_desc ldt_info; - int oldsize, newsize; - struct ldt_struct *new_ldt, *old_ldt; error = -EINVAL; if (bytecount != sizeof(ldt_info)) @@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) old_ldt = mm->context.ldt; oldsize = old_ldt ? old_ldt->size : 0; - newsize = max((int)(ldt_info.entry_number + 1), oldsize); + newsize = max(ldt_info.entry_number + 1, oldsize); error = -ENOMEM; new_ldt = alloc_ldt_struct(newsize); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8c1f218926d7..307b1f4543de 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { - VMCOREINFO_SYMBOL(phys_base); + VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA @@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET); - VMCOREINFO_VMALLOC_START(VMALLOC_START); - VMCOREINFO_VMEMMAP_START(VMEMMAP_START); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7f3550acde1b..ef688804f80d 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -44,6 +44,7 @@ #include <asm/msr.h> static struct class *msr_class; +static enum cpuhp_state cpuhp_msr_state; static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -180,7 +181,7 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; -static int msr_device_create(int cpu) +static int msr_device_create(unsigned int cpu) { struct device *dev; @@ -189,34 +190,12 @@ static int msr_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void msr_device_destroy(int cpu) +static int msr_device_destroy(unsigned int cpu) { device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + return 0; } -static int msr_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = msr_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - msr_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct notifier_block __refdata msr_class_cpu_notifier = { - .notifier_call = msr_class_cpu_callback, -}; - static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); @@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode) static int __init msr_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { pr_err("unable to get major %d for msr\n", MSR_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } msr_class = class_create(THIS_MODULE, "msr"); if (IS_ERR(msr_class)) { @@ -239,44 +216,27 @@ static int __init msr_init(void) } msr_class->devnode = msr_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = msr_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); - - err = 0; - goto out; + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", + msr_device_create, msr_device_destroy); + if (err < 0) + goto out_class; + cpuhp_msr_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) - msr_device_destroy(i); - cpu_notifier_register_done(); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); -out: return err; } +module_init(msr_init); static void __exit msr_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - msr_device_destroy(cpu); + cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); - __unregister_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(msr_init); module_exit(msr_exit) MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bfe4d6c96fbd..f088ea4c66e7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -13,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/kdebug.h> +#include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/debugfs.h> #include <linux/delay.h> @@ -20,6 +21,7 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/sched/clock.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 2c55a003b793..8f2d1c9d43a8 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); } - PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); bool pv_is_native_spin_unlock(void) @@ -21,15 +20,25 @@ bool pv_is_native_spin_unlock(void) __raw_callee_save___native_queued_spin_unlock; } +__visible bool __native_vcpu_is_preempted(long cpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); + +bool pv_is_native_vcpu_is_preempted(void) +{ + return pv_lock_ops.vcpu_is_preempted.func == + __raw_callee_save___native_vcpu_is_preempted; +} + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, + .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); - -struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bbf3d5933eaa..4797e87b0fb6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, - .clts = native_clts, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, @@ -426,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, + .set_pud_at = native_set_pud_at, .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 920c6ae08592..553acbbb4d32 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { @@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: __maybe_unused ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index bb3840cedb4f..11aaf1eaa0e4 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) @@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: __maybe_unused ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5d400ba1349d..0c150c06fa5a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -296,7 +296,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, /* were we called with bad_dma_address? */ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + if (unlikely(dma_addr < badend)) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -478,7 +478,7 @@ static void calgary_free_coherent(struct device *dev, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } -static struct dma_map_ops calgary_dma_ops = { +static const struct dma_map_ops calgary_dma_ops = { .alloc = calgary_alloc_coherent, .free = calgary_free_coherent, .map_sg = calgary_map_sg, @@ -1177,7 +1177,7 @@ static int __init calgary_init(void) tbl = find_iommu_table(&dev->dev); if (translation_enabled(tbl)) - dev->dev.archdata.dma_ops = &calgary_dma_ops; + dev->dev.dma_ops = &calgary_dma_ops; } return ret; @@ -1201,7 +1201,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - dev->dev.archdata.dma_ops = NULL; + dev->dev.dma_ops = NULL; } while (1); return ret; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d30c37750765..3a216ec869cd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -17,7 +17,7 @@ static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops = &nommu_dma_ops; +const struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -91,7 +91,8 @@ again: page = NULL; /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(flag)) { - page = dma_alloc_from_contiguous(dev, count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + flag); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); page = NULL; @@ -214,7 +215,7 @@ early_param("iommu", iommu_setup); int dma_supported(struct device *dev, u64 mask) { - struct dma_map_ops *ops = get_dma_ops(dev); + const struct dma_map_ops *ops = get_dma_ops(dev); #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 00e71ce396a8..a88952ef371c 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -88,7 +88,7 @@ static void nommu_sync_sg_for_device(struct device *dev, flush_write_buffers(); } -struct dma_map_ops nommu_dma_ops = { +const struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, .free = dma_generic_free_coherent, .map_sg = nommu_map_sg, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index b47edb8f5256..1e23577e17cf 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -45,7 +45,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size, dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } -static struct dma_map_ops swiotlb_dma_ops = { +static const struct dma_map_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc = x86_swiotlb_alloc_coherent, .free = x86_swiotlb_free_coherent, @@ -68,12 +68,10 @@ static struct dma_map_ops swiotlb_dma_ops = { */ int __init pci_swiotlb_detect_override(void) { - int use_swiotlb = swiotlb | swiotlb_force; - - if (swiotlb_force) + if (swiotlb_force == SWIOTLB_FORCE) swiotlb = 1; - return use_swiotlb; + return swiotlb; } IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_xen_swiotlb_detect, diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index da8cb987b973..587d887f7f17 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -1,6 +1,7 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/perf_event.h> #include <linux/bug.h> #include <linux/stddef.h> diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 24a50301f150..91271122f0df 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -6,6 +6,7 @@ void __init x86_early_init_platform_quirks(void) { + x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT; x86_platform.legacy.rtc = 1; x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; @@ -16,10 +17,14 @@ void __init x86_early_init_platform_quirks(void) break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; case X86_SUBARCH_INTEL_MID: case X86_SUBARCH_CE4100: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; break; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0888a879120f..56b059486c3b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,10 @@ #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/idle.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/init.h> #include <linux/export.h> #include <linux/pm.h> @@ -23,8 +27,7 @@ #include <asm/cpu.h> #include <asm/apic.h> #include <asm/syscalls.h> -#include <asm/idle.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/internal.h> #include <asm/debugreg.h> @@ -33,6 +36,7 @@ #include <asm/mce.h> #include <asm/vm86.h> #include <asm/switch_to.h> +#include <asm/desc.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -65,22 +69,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); -#ifdef CONFIG_X86_64 -static DEFINE_PER_CPU(unsigned char, is_idle); -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); -#endif +DEFINE_PER_CPU(bool, need_tr_refresh); +EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh); /* * this gets called so that we can store lazy state into memory and copy the @@ -227,6 +217,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); + + /* + * Make sure that the TSS limit is correct for the CPU + * to notice the IO bitmap. + */ + refresh_TR(); } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { /* * Clear any possible leftover bits: @@ -251,39 +247,10 @@ static inline void play_dead(void) } #endif -#ifdef CONFIG_X86_64 -void enter_idle(void) -{ - this_cpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} -#endif - void arch_cpu_idle_enter(void) { + tsc_verify_tsc_adjust(false); local_touch_nmi(); - enter_idle(); -} - -void arch_cpu_idle_exit(void) -{ - __exit_idle(); } void arch_cpu_idle_dead(void) @@ -336,59 +303,33 @@ void stop_this_cpu(void *dummy) halt(); } -bool amd_e400_c1e_detected; -EXPORT_SYMBOL(amd_e400_c1e_detected); - -static cpumask_var_t amd_e400_c1e_mask; - -void amd_e400_remove_cpu(int cpu) -{ - if (amd_e400_c1e_mask != NULL) - cpumask_clear_cpu(cpu, amd_e400_c1e_mask); -} - /* - * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt - * pending message MSR. If we detect C1E, then we handle it the same - * way as C3 power states (local apic timer and TSC stop) + * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power + * states (local apic timer and TSC stop). */ static void amd_e400_idle(void) { - if (!amd_e400_c1e_detected) { - u32 lo, hi; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - amd_e400_c1e_detected = true; - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halt in AMD C1E"); - pr_info("System has AMD C1E enabled\n"); - } + /* + * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E + * gets set after static_cpu_has() places have been converted via + * alternatives. + */ + if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + default_idle(); + return; } - if (amd_e400_c1e_detected) { - int cpu = smp_processor_id(); - - if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { - cpumask_set_cpu(cpu, amd_e400_c1e_mask); - /* Force broadcast so ACPI can not interfere. */ - tick_broadcast_force(); - pr_info("Switch to broadcast mode on CPU%d\n", cpu); - } - tick_broadcast_enter(); + tick_broadcast_enter(); - default_idle(); + default_idle(); - /* - * The switch back from broadcast mode needs to be - * called with interrupts disabled. - */ - local_irq_disable(); - tick_broadcast_exit(); - local_irq_enable(); - } else - default_idle(); + /* + * The switch back from broadcast mode needs to be called with + * interrupts disabled. + */ + local_irq_disable(); + tick_broadcast_exit(); + local_irq_enable(); } /* @@ -448,8 +389,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c) if (x86_idle || boot_option_idle_override == IDLE_POLL) return; - if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) { - /* E400: APIC timer interrupt does not wake up CPU from C1e */ + if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; } else if (prefer_mwait_c1_over_halt(c)) { @@ -459,11 +399,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c) x86_idle = default_idle; } -void __init init_amd_e400_c1e_mask(void) +void amd_e400_c1e_apic_setup(void) +{ + if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); + local_irq_disable(); + tick_broadcast_force(); + local_irq_enable(); + } +} + +void __init arch_post_acpi_subsys_init(void) { - /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ - if (x86_idle == amd_e400_idle) - zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); + u32 lo, hi; + + if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) + return; + + /* + * AMD E400 detection needs to happen after ACPI has been enabled. If + * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in + * MSR_K8_INT_PENDING_MSG. + */ + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) + return; + + boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + pr_info("System has AMD C1E enabled\n"); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd7be8efdc4c..4c818f8bc135 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -12,6 +12,8 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -49,11 +51,11 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> +#include <asm/intel_rdt.h> void __show_regs(struct pt_regs *regs, int all) { @@ -72,10 +74,9 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - (u16)regs->cs, regs->ip, regs->flags, - smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); + printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, + smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -232,11 +233,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -295,9 +295,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); this_cpu_write(current_task, next_p); + /* Load the Intel cache allocation PQR MSR. */ + intel_rdt_sched_in(); + return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..d6b784a5520d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -17,6 +17,8 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -44,12 +46,12 @@ #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> +#include <asm/intel_rdt.h> __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -61,10 +63,15 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, - regs->sp, regs->flags); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, + (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, + regs->sp, regs->flags); + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else + pr_cont("\n"); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", @@ -265,9 +272,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned prev_fsindex, prev_gsindex; - fpu_switch_t fpu_switch; - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -417,7 +423,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) prev->gsbase = 0; prev->gsindex = prev_gsindex; - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); /* * Switch the PDA and FPU contexts. @@ -473,6 +479,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) loadsegment(ss, __KERNEL_DS); } + /* Load the Intel cache allocation PQR MSR. */ + intel_rdt_sched_in(); + return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0e63c0267f99..2364b23ea3e5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> @@ -24,7 +25,7 @@ #include <linux/export.h> #include <linux/context_tracking.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/fpu/internal.h> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 5b2cc889ce34..5c3f6d6a5078 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -21,6 +21,8 @@ #include <linux/sched.h> #include <linux/gfp.h> #include <linux/bootmem.h> +#include <linux/nmi.h> + #include <asm/fixmap.h> #include <asm/pvclock.h> @@ -71,10 +73,10 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) return flags & valid_flags; } -cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { unsigned version; - cycle_t ret; + u64 ret; u64 last; u8 flags; diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 79c6311cd912..5b21cb7d84d6 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -64,6 +64,15 @@ void mach_get_cmos_time(struct timespec *now) unsigned int status, year, mon, day, hour, min, sec, century = 0; unsigned long flags; + /* + * If pm_trace abused the RTC as storage, set the timespec to 0, + * which tells the caller that this RTC value is unusable. + */ + if (!pm_trace_rtc_valid()) { + now->tv_sec = now->tv_nsec = 0; + return; + } + spin_lock_irqsave(&rtc_lock, flags); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9c337b0e8ba7..4bf0c8926a1c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { /* - * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, + * as old kexec-tools loads bzImage below that, unless + * "crashkernel=size[KMG],high" is specified. */ crash_base = memblock_find_in_range(CRASH_ALIGN, high ? CRASH_ADDR_HIGH_MAX @@ -985,6 +987,30 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + x86_report_nx(); /* after early param, so could get panic from serial */ @@ -1152,6 +1178,20 @@ void __init setup_arch(char **cmdline_p) /* Allocate bigger log buffer */ setup_log_buf(1); + if (efi_enabled(EFI_BOOT)) { + switch (boot_params.secure_boot) { + case efi_secureboot_mode_disabled: + pr_info("Secure boot disabled\n"); + break; + case efi_secureboot_mode_enabled: + pr_info("Secure boot enabled\n"); + break; + default: + pr_info("Secure boot could not be determined\n"); + break; + } + } + reserve_initrd(); acpi_table_upgrade(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2bbd27f89802..9820d6d977c6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,7 +1,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/bootmem.h> #include <linux/percpu.h> @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <asm/sections.h> #include <asm/processor.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/mpspec.h> #include <asm/apicdef.h> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 763af1d0de64..396c042e9d0e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index c00cb64bc0a1..d3c66a15bbde 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -259,18 +259,16 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -__visible void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { - irq_enter(); ack_APIC_irq(); __smp_reschedule_interrupt(); - irq_exit(); /* * KVM uses this interrupt to force a cpu out of guest mode */ } -__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -294,14 +292,15 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -316,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 42f5eb7b4f6c..bd1f1ad35284 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -45,6 +45,9 @@ #include <linux/smp.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/topology.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/task_stack.h> #include <linux/percpu.h> #include <linux/bootmem.h> #include <linux/err.h> @@ -58,7 +61,6 @@ #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/realmode.h> #include <asm/cpu.h> #include <asm/numa.h> @@ -104,11 +106,21 @@ static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; -static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; +/* Flag to indicate if a complete sched domain rebuild is required */ +bool x86_topology_update; + +int arch_update_cpu_topology(void) +{ + int retval = x86_topology_update; + + x86_topology_update = false; + return retval; +} + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -263,9 +275,14 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -int topology_update_package_map(unsigned int apicid, unsigned int cpu) +/** + * topology_update_package_map - Update the physical to logical package map + * @pkg: The physical package id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_package_map(unsigned int pkg, unsigned int cpu) { - unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + unsigned int new; /* Called from early boot ? */ if (!physical_package_map) @@ -278,16 +295,17 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - if (logical_packages_frozen) { - physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package max\n", - apicid, pkg); + if (logical_packages >= __max_logical_packages) { + pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n", + logical_packages, cpu, __max_logical_packages); return -ENOSPC; } new = logical_packages++; - pr_info("APIC(%x) Converting physical %u to logical package %u\n", - apicid, pkg, new); + if (new != pkg) { + pr_info("CPU %u Converting physical %u to logical package %u\n", + cpu, pkg, new); + } physical_to_logical_pkg[pkg] = new; found: @@ -308,9 +326,9 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) } EXPORT_SYMBOL(topology_phys_to_logical_pkg); -static void __init smp_init_package_map(void) +static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu) { - unsigned int ncpus, cpu; + unsigned int ncpus; size_t size; /* @@ -355,27 +373,9 @@ static void __init smp_init_package_map(void) size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - for_each_present_cpu(cpu) { - unsigned int apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) - continue; - if (!topology_update_package_map(apicid, cpu)) - continue; - pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); - per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; - set_cpu_possible(cpu, false); - set_cpu_present(cpu, false); - } - - if (logical_packages > __max_logical_packages) { - pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", - logical_packages, __max_logical_packages); - logical_packages_frozen = true; - __max_logical_packages = logical_packages; - } - pr_info("Max logical packages: %u\n", __max_logical_packages); + + topology_update_package_map(c->phys_proc_id, cpu); } void __init smp_store_boot_cpu_info(void) @@ -385,7 +385,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - smp_init_package_map(); + smp_init_package_map(c, id); } /* @@ -436,9 +436,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && - c->cpu_core_id == o->cpu_core_id) - return topology_sane(c, o, "smt"); + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } } else if (c->phys_proc_id == o->phys_proc_id && c->cpu_core_id == o->cpu_core_id) { @@ -471,22 +477,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +static inline int x86_sched_itmt_flags(void) +{ + return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; +} + +#ifdef CONFIG_SCHED_MC +static int x86_core_flags(void) +{ + return cpu_core_flags() | x86_sched_itmt_flags(); +} +#endif +#ifdef CONFIG_SCHED_SMT +static int x86_smt_flags(void) +{ + return cpu_smt_flags() | x86_sched_itmt_flags(); +} +#endif +#endif + static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { NULL, }, }; static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, @@ -821,14 +847,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -void smp_announce(void) -{ - int num_nodes = num_online_nodes(); - - printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", - num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { @@ -964,9 +982,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(idle))) - 1); - + idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; @@ -1111,7 +1127,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) return err; /* the FPU context is blank, nobody can own it */ - __cpu_disable_lazy_restore(cpu); + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; common_cpu_up(cpu, tidle); @@ -1331,11 +1347,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); - pr_info("CPU%d: ", 0); + pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); - if (is_uv_system()) - uv_system_init(); + uv_system_init(); set_mtrr_aps_delayed_init(); @@ -1456,15 +1471,15 @@ __init void prefill_possible_map(void) possible = i; } + nr_cpu_ids = possible; + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); + reset_cpu_possible_mask(); + for (i = 0; i < possible; i++) set_cpu_possible(i, true); - for (; i < NR_CPUS; i++) - set_cpu_possible(i, false); - - nr_cpu_ids = possible; } #ifdef CONFIG_HOTPLUG_CPU @@ -1575,7 +1590,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - amd_e400_remove_cpu(raw_smp_processor_id()); /* Ack it */ (void)cpu_report_death(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 0653788026e2..8e2b79b88e51 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -4,6 +4,8 @@ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ #include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <linux/export.h> #include <linux/uaccess.h> diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index a23ce84a3f6c..f07f83b3611b 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -2,6 +2,7 @@ * x86 single-step support code, common to 32-bit and 64-bit. */ #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/ptrace.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a55ed63b9f91..50215a4b9347 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,5 +1,6 @@ #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/fs.h> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 8402907825b0..b868fa1b812b 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -408,7 +408,7 @@ static __init int tboot_late_init(void) tboot_create_trampoline(); atomic_set(&ap_wfs_count, 0); - cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL, + cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL, tboot_dying_cpu); #ifdef CONFIG_DEBUG_FS debugfs_create_file("tboot_log", S_IRUSR, diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c deleted file mode 100644 index 27538f183c3b..000000000000 --- a/arch/x86/kernel/test_nx.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * test_nx.c: functional test for NX functionality - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <linux/module.h> -#include <linux/sort.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/asm.h> - -extern int rodata_test_data; - -/* - * This file checks 4 things: - * 1) Check if the stack is not executable - * 2) Check if kmalloc memory is not executable - * 3) Check if the .rodata section is not executable - * 4) Check if the .data section of a module is not executable - * - * To do this, the test code tries to execute memory in stack/kmalloc/etc, - * and then checks if the expected trap happens. - * - * Sadly, this implies having a dynamic exception handling table entry. - * ... which can be done (and will make Rusty cry)... but it can only - * be done in a stand-alone module with only 1 entry total. - * (otherwise we'd have to sort and that's just too messy) - */ - - - -/* - * We want to set up an exception handling point on our stack, - * which means a variable value. This function is rather dirty - * and walks the exception table of the module, looking for a magic - * marker and replaces it with a specific function. - */ -static void fudze_exception_table(void *marker, void *new) -{ - struct module *mod = THIS_MODULE; - struct exception_table_entry *extable; - - /* - * Note: This module has only 1 exception table entry, - * so searching and sorting is not needed. If that changes, - * this would be the place to search and re-sort the exception - * table. - */ - if (mod->num_exentries > 1) { - printk(KERN_ERR "test_nx: too many exception table entries!\n"); - printk(KERN_ERR "test_nx: test results are not reliable.\n"); - return; - } - extable = (struct exception_table_entry *)mod->extable; - extable[0].insn = (unsigned long)new; -} - - -/* - * exception tables get their symbols translated so we need - * to use a fake function to put in there, which we can then - * replace at runtime. - */ -void foo_label(void); - -/* - * returns 0 for not-executable, negative for executable - * - * Note: we cannot allow this function to be inlined, because - * that would give us more than 1 exception table entry. - * This in turn would break the assumptions above. - */ -static noinline int test_address(void *address) -{ - unsigned long result; - - /* Set up an exception table entry for our address */ - fudze_exception_table(&foo_label, address); - result = 1; - asm volatile( - "foo_label:\n" - "0: call *%[fake_code]\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: mov %[zero], %[rslt]\n" - " ret\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - : [rslt] "=r" (result) - : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) - ); - /* change the exception table back for the next round */ - fudze_exception_table(address, &foo_label); - - if (result) - return -ENODEV; - return 0; -} - -static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */ - -static int test_NX(void) -{ - int ret = 0; - /* 0xC3 is the opcode for "ret" */ - char stackcode[] = {0xC3, 0x90, 0 }; - char *heap; - - test_data = 0xC3; - - printk(KERN_INFO "Testing NX protection\n"); - - /* Test 1: check if the stack is not executable */ - if (test_address(&stackcode)) { - printk(KERN_ERR "test_nx: stack was executable\n"); - ret = -ENODEV; - } - - - /* Test 2: Check if the heap is executable */ - heap = kmalloc(64, GFP_KERNEL); - if (!heap) - return -ENOMEM; - heap[0] = 0xC3; /* opcode for "ret" */ - - if (test_address(heap)) { - printk(KERN_ERR "test_nx: heap was executable\n"); - ret = -ENODEV; - } - kfree(heap); - - /* - * The following 2 tests currently fail, this needs to get fixed - * Until then, don't run them to avoid too many people getting scared - * by the error message - */ - - /* Test 3: Check if the .rodata section is executable */ - if (rodata_test_data != 0xC3) { - printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); - ret = -ENODEV; - } else if (test_address(&rodata_test_data)) { - printk(KERN_ERR "test_nx: .rodata section is executable\n"); - ret = -ENODEV; - } - -#if 0 - /* Test 4: Check if the .data section of a module is executable */ - if (test_address(&test_data)) { - printk(KERN_ERR "test_nx: .data section is executable\n"); - ret = -ENODEV; - } - -#endif - return ret; -} - -static void test_exit(void) -{ -} - -module_init(test_NX); -module_exit(test_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the NX infrastructure"); -MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c deleted file mode 100644 index 222e84e2432e..000000000000 --- a/arch/x86/kernel/test_rodata.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * test_rodata.c: functional test for mark_rodata_ro function - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <asm/cacheflush.h> -#include <asm/sections.h> -#include <asm/asm.h> - -int rodata_test(void) -{ - unsigned long result; - unsigned long start, end; - - /* test 1: read the value */ - /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: test 1 fails (start data)\n"); - return -ENODEV; - } - - /* test 2: write to the variable; this should fault */ - /* - * If this test fails, we managed to overwrite the data - * - * This is written in assembly to be able to catch the - * exception that is supposed to happen in the correct - * case - */ - - result = 1; - asm volatile( - "0: mov %[zero],(%[rodata_test])\n" - " mov %[zero], %[rslt]\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - : [rslt] "=r" (result) - : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) - ); - - - if (!result) { - printk(KERN_ERR "rodata_test: test data was not read only\n"); - return -ENODEV; - } - - /* test 3: check the value hasn't changed */ - /* If this test fails, we managed to overwrite the data */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n"); - return -ENODEV; - } - /* test 4: check if the rodata section is 4Kb aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n"); - return -ENODEV; - } - if (end & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n"); - return -ENODEV; - } - - return 0; -} diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9692a5e9fdab..6c8934406dc9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -5,7 +5,7 @@ #include <linux/regset.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..15515132bf0d 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -34,7 +34,7 @@ static void switch_idt(void *arg) local_irq_restore(flags); } -void trace_irq_vector_regfunc(void) +int trace_irq_vector_regfunc(void) { mutex_lock(&irq_vector_mutex); if (!trace_irq_vector_refcount) { @@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void) } trace_irq_vector_refcount++; mutex_unlock(&irq_vector_mutex); + return 0; } void trace_irq_vector_unregfunc(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bd4e3d4d3625..948443e115c1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/kexec.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> @@ -563,11 +564,9 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_disable(); cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -742,14 +741,12 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_disable(); cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } @@ -769,7 +766,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); cond_local_irq_disable(regs); - preempt_enable_no_resched(); debug_stack_usage_dec(); exit: @@ -853,6 +849,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { + unsigned long cr0; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION @@ -866,10 +864,20 @@ do_device_not_available(struct pt_regs *regs, long error_code) return; } #endif - fpu__restore(¤t->thread.fpu); /* interrupts still off */ -#ifdef CONFIG_X86_32 - cond_local_irq_enable(regs); -#endif + + /* This should not happen. */ + cr0 = read_cr0(); + if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { + /* Try to fix it up and carry on. */ + write_cr0(cr0 & ~X86_CR0_TS); + } else { + /* + * Something terrible happened, and we're better off trying + * to kill the task than getting stuck in a never-ending + * loop of #NM faults. + */ + die("unexpected #NM exception", regs, error_code); + } } NOKPROBE_SYMBOL(do_device_not_available); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46b2f41f8b05..46bcda4cb1c2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> @@ -694,6 +695,7 @@ unsigned long native_calibrate_tsc(void) crystal_khz = 24000; /* 24.0 MHz */ break; case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -702,6 +704,20 @@ unsigned long native_calibrate_tsc(void) } } + /* + * TSC frequency determined by CPUID is a "hardware reported" + * frequency and is the most accurate one so far we have. This + * is considered a known frequency. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * For Atom SoCs TSC is the only reliable clocksource. + * Mark TSC reliable so no watchdog on it. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return crystal_khz * ebx_numerator / eax_denominator; } @@ -1043,18 +1059,20 @@ static void detect_art(void) if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); - - /* Don't enable ART in a VM, non-stop TSC required */ + /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - art_to_tsc_denominator < ART_MIN_DENOMINATOR) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return; - if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) return; + rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } @@ -1064,6 +1082,11 @@ static void detect_art(void) static struct clocksource clocksource_tsc; +static void tsc_resume(struct clocksource *cs) +{ + tsc_verify_tsc_adjust(true); +} + /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a @@ -1080,9 +1103,19 @@ static struct clocksource clocksource_tsc; * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ -static cycle_t read_tsc(struct clocksource *cs) +static u64 read_tsc(struct clocksource *cs) { - return (cycle_t)rdtsc_ordered(); + return (u64)rdtsc_ordered(); +} + +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + tsc_unstable = 1; + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); } /* @@ -1096,6 +1129,8 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, + .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, }; void mark_tsc_unstable(char *reason) @@ -1170,7 +1205,7 @@ int unsynchronized_tsc(void) /* * Convert ART to TSC given numerator/denominator found in detect_art() */ -struct system_counterval_t convert_art_to_tsc(cycle_t art) +struct system_counterval_t convert_art_to_tsc(u64 art) { u64 tmp, res, rem; @@ -1283,10 +1318,10 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* - * Trust the results of the earlier calibration on systems - * exporting a reliable TSC. + * When TSC frequency is known (retrieved via MSR or CPUID), we skip + * the refined calibration and directly register it as a clocksource. */ - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } @@ -1333,6 +1368,9 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 0fe720d64fef..19afdbd7d0a7 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void) #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; #endif + + /* + * TSC frequency determined by MSR is always considered "known" + * because it is reported by HW. + * Another fact is that on MSR capable platforms, PIT/HPET is + * generally not available so calibration won't work at all. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Unfortunately there is no way for hardware to tell whether the + * TSC is reliable. We were told by silicon design team that TSC + * on Atom SoCs are always "reliable". TSC is also the only + * reliable clocksource on these SoCs (HPET is either not present + * or not functional) so mark TSC reliable which removes the + * requirement for a watchdog clocksource. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return res; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 78083bf23ed1..728f75378475 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -14,18 +14,166 @@ * ( The serial nature of the boot logic and the CPU hotplug lock * protects against more than 2 CPUs entering this code. ) */ +#include <linux/topology.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/smp.h> #include <linux/nmi.h> #include <asm/tsc.h> +struct tsc_adjust { + s64 bootval; + s64 adjusted; + unsigned long nextcheck; + bool warned; +}; + +static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); + +void tsc_verify_tsc_adjust(bool resume) +{ + struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); + s64 curval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + + /* Rate limit the MSR check */ + if (!resume && time_before(jiffies, adj->nextcheck)) + return; + + adj->nextcheck = jiffies + HZ; + + rdmsrl(MSR_IA32_TSC_ADJUST, curval); + if (adj->adjusted == curval) + return; + + /* Restore the original value */ + wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + + if (!adj->warned || resume) { + pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", + smp_processor_id(), adj->adjusted, curval); + adj->warned = true; + } +} + +static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, + unsigned int cpu, bool bootcpu) +{ + /* + * First online CPU in a package stores the boot value in the + * adjustment value. This value might change later via the sync + * mechanism. If that fails we still can yell about boot values not + * being consistent. + * + * On the boot cpu we just force set the ADJUST value to 0 if it's + * non zero. We don't do that on non boot cpus because physical + * hotplug should have set the ADJUST register to a value > 0 so + * the TSC is in sync with the already running cpus. + * + * But we always force positive ADJUST values. Otherwise the TSC + * deadline timer creates an interrupt storm. We also have to + * prevent values > 0x7FFFFFFF as those wreckage the timer as well. + */ + if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || + (bootval > 0x7FFFFFFF)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, + bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } + cur->adjusted = bootval; +} + +#ifndef CONFIG_SMP +bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); + return false; +} + +#else /* !CONFIG_SMP */ + +/* + * Store and check the TSC ADJUST MSR if available + */ +bool tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + unsigned int refcpu, cpu = smp_processor_id(); + struct cpumask *mask; + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + cur->warned = false; + + /* + * Check whether this CPU is the first in a package to come up. In + * this case do not check the boot value against another package + * because the new package might have been physically hotplugged, + * where TSC_ADJUST is expected to be different. When called on the + * boot CPU topology_core_cpumask() might not be available yet. + */ + mask = topology_core_cpumask(cpu); + refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids; + + if (refcpu >= nr_cpu_ids) { + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), + bootcpu); + return false; + } + + ref = per_cpu_ptr(&tsc_adjust, refcpu); + /* + * Compare the boot value and complain if it differs in the + * package. + */ + if (bootval != ref->bootval) { + pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->bootval, cpu, bootval); + } + /* + * The TSC_ADJUST values in a package must be the same. If the boot + * value on this newly upcoming CPU differs from the adjustment + * value of the already online CPU in this package, set it to that + * adjusted value. + */ + if (bootval != ref->adjusted) { + pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->adjusted, cpu, bootval); + cur->adjusted = ref->adjusted; + wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + } + /* + * We have the TSCs forced to be in sync on this package. Skip sync + * test: + */ + return true; +} + /* * Entry/exit counters that make sure that both CPUs * run the measurement code at once: */ static atomic_t start_count; static atomic_t stop_count; +static atomic_t skip_test; +static atomic_t test_runs; /* * We use a raw spinlock in this exceptional case, because @@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static cycles_t last_tsc; static cycles_t max_warp; static int nr_warps; +static int random_warps; /* * TSC-warp measurement loop running on both CPUs. This is not called * if there is no TSC. */ -static void check_tsc_warp(unsigned int timeout) +static cycles_t check_tsc_warp(unsigned int timeout) { - cycles_t start, now, prev, end; - int i; + cycles_t start, now, prev, end, cur_max_warp = 0; + int i, cur_warps = 0; start = rdtsc_ordered(); /* @@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout) if (unlikely(prev > now)) { arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); + cur_max_warp = max_warp; + /* + * Check whether this bounces back and forth. Only + * one CPU should observe time going backwards. + */ + if (cur_warps != nr_warps) + random_warps++; nr_warps++; + cur_warps = nr_warps; arch_spin_unlock(&sync_lock); } } WARN(!(now-start), "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); + return cur_max_warp; } /* @@ -128,23 +286,27 @@ void check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (tsc_clocksource_reliable) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - /* - * Reset it - in case this is a second bootup: + * Set the maximum number of test runs to + * 1 if the CPU does not provide the TSC_ADJUST MSR + * 3 if the MSR is available, so the target can try to adjust */ - atomic_set(&stop_count, 0); - + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + atomic_set(&test_runs, 1); + else + atomic_set(&test_runs, 3); +retry: /* - * Wait for the target to arrive: + * Wait for the target to start or to skip the test: */ - while (atomic_read(&start_count) != cpus-1) + while (atomic_read(&start_count) != cpus - 1) { + if (atomic_read(&skip_test) > 0) { + atomic_set(&skip_test, 0); + return; + } cpu_relax(); + } + /* * Trigger the target to continue into the measurement too: */ @@ -155,21 +317,35 @@ void check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - if (nr_warps) { + /* + * If the test was successful set the number of runs to zero and + * stop. If not, decrement the number of runs an check if we can + * retry. In case of random warps no retry is attempted. + */ + if (!nr_warps) { + atomic_set(&test_runs, 0); + + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); + + } else if (atomic_dec_and_test(&test_runs) || random_warps) { + /* Force it to 0 if random warps brought us here */ + atomic_set(&test_runs, 0); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); + if (random_warps) + pr_warning("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); - } else { - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", - smp_processor_id(), cpu); } /* * Reset it - just in case we boot another CPU later: */ atomic_set(&start_count, 0); + random_warps = 0; nr_warps = 0; max_warp = 0; last_tsc = 0; @@ -178,6 +354,12 @@ void check_tsc_sync_source(int cpu) * Let the target continue with the bootup: */ atomic_inc(&stop_count); + + /* + * Retry, if there is a chance to do so. + */ + if (atomic_read(&test_runs) > 0) + goto retry; } /* @@ -185,12 +367,30 @@ void check_tsc_sync_source(int cpu) */ void check_tsc_sync_target(void) { + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + unsigned int cpu = smp_processor_id(); + cycles_t cur_max_warp, gbl_max_warp; int cpus = 2; /* Also aborts if there is no TSC. */ - if (unsynchronized_tsc() || tsc_clocksource_reliable) + if (unsynchronized_tsc()) + return; + + /* + * Store, verify and sanitize the TSC adjust register. If + * successful skip the test. + * + * The test is also skipped when the TSC is marked reliable. This + * is true for SoCs which have no fallback clocksource. On these + * SoCs the TSC is frequency synchronized, but still the TSC ADJUST + * register might have been wreckaged by the BIOS.. + */ + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { + atomic_inc(&skip_test); return; + } +retry: /* * Register this CPU's participation and wait for the * source CPU to start the measurement: @@ -199,7 +399,12 @@ void check_tsc_sync_target(void) while (atomic_read(&start_count) != cpus) cpu_relax(); - check_tsc_warp(loop_timeout(smp_processor_id())); + cur_max_warp = check_tsc_warp(loop_timeout(cpu)); + + /* + * Store the maximum observed warp value for a potential retry: + */ + gbl_max_warp = max_warp; /* * Ok, we are done: @@ -211,4 +416,61 @@ void check_tsc_sync_target(void) */ while (atomic_read(&stop_count) != cpus) cpu_relax(); + + /* + * Reset it for the next sync test: + */ + atomic_set(&stop_count, 0); + + /* + * Check the number of remaining test runs. If not zero, the test + * failed and a retry with adjusted TSC is possible. If zero the + * test was either successful or failed terminally. + */ + if (!atomic_read(&test_runs)) + return; + + /* + * If the warp value of this CPU is 0, then the other CPU + * observed time going backwards so this TSC was ahead and + * needs to move backwards. + */ + if (!cur_max_warp) + cur_max_warp = -gbl_max_warp; + + /* + * Add the result to the previous adjustment value. + * + * The adjustement value is slightly off by the overhead of the + * sync mechanism (observed values are ~200 TSC cycles), but this + * really depends on CPU, node distance and frequency. So + * compensating for this is hard to get right. Experiments show + * that the warp is not longer detectable when the observed warp + * value is used. In the worst case the adjustment needs to go + * through a 3rd run for fine tuning. + */ + cur->adjusted += cur_max_warp; + + /* + * TSC deadline timer stops working or creates an interrupt storm + * with adjust values < 0 and > x07ffffff. + * + * To allow adjust values > 0x7FFFFFFF we need to disable the + * deadline timer and use the local APIC timer, but that requires + * more intrusive changes and we do not have any useful information + * from Intel about the underlying HW wreckage yet. + */ + if (cur->adjusted < 0) + cur->adjusted = 0; + if (cur->adjusted > 0x7FFFFFFF) + cur->adjusted = 0x7FFFFFFF; + + pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", + cpu, cur_max_warp, cur->adjusted); + + wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + goto retry; + } + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a2456d4d286a..478d15dbaee4 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -1,4 +1,6 @@ #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <asm/ptrace.h> #include <asm/bitops.h> #include <asm/stacktrace.h> @@ -6,6 +8,52 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static void unwind_dump(struct unwind_state *state, unsigned long *sp) +{ + static bool dumped_before = false; + bool prev_zero, zero = false; + unsigned long word; + + if (dumped_before) + return; + + dumped_before = true; + + printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n", + state->stack_info.type, state->stack_info.next_sp, + state->stack_mask, state->graph_idx); + + for (sp = state->orig_sp; sp < state->stack_info.end; sp++) { + word = READ_ONCE_NOCHECK(*sp); + + prev_zero = zero; + zero = word == 0; + + if (zero) { + if (!prev_zero) + printk_deferred("%p: %016x ...\n", sp, 0); + continue; + } + + printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word); + } +} + unsigned long unwind_get_return_address(struct unwind_state *state) { unsigned long addr; @@ -14,17 +62,60 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (unwind_done(state)) return 0; - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + if (state->regs && user_mode(state->regs)) + return 0; + + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return __kernel_text_address(addr) ? addr : 0; } EXPORT_SYMBOL_GPL(unwind_get_return_address); +static size_t regs_size(struct pt_regs *regs) +{ + /* x86_32 regs from kernel mode are two words shorter: */ + if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) + return sizeof(*regs) - 2*sizeof(long); + + return sizeof(*regs); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + unsigned long bp = (unsigned long)state->bp; + unsigned long regs = (unsigned long)task_pt_regs(state->task); + + /* + * We have to check for the last task frame at two different locations + * because gcc can occasionally decide to realign the stack pointer and + * change the offset of the stack frame by a word in the prologue of a + * function called by head/entry code. + */ + return bp == regs - FRAME_HEADER_SIZE || + bp == regs - FRAME_HEADER_SIZE - sizeof(long); +} + +/* + * This determines if the frame pointer actually contains an encoded pointer to + * pt_regs on the stack. See ENCODE_FRAME_POINTER. + */ +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (!(regs & 0x1)) + return NULL; + + return (struct pt_regs *)(regs & ~0x1); +} + static bool update_stack_state(struct unwind_state *state, void *addr, size_t len) { struct stack_info *info = &state->stack_info; + enum stack_type orig_type = info->type; /* * If addr isn't on the current stack, switch to the next one. @@ -38,31 +129,137 @@ static bool update_stack_state(struct unwind_state *state, void *addr, &state->stack_mask)) return false; + if (!state->orig_sp || info->type != orig_type) + state->orig_sp = addr; + return true; } bool unwind_next_frame(struct unwind_state *state) { - unsigned long *next_bp; + struct pt_regs *regs; + unsigned long *next_bp, *next_frame; + size_t next_len; + enum stack_type prev_type = state->stack_info.type; if (unwind_done(state)) return false; - next_bp = (unsigned long *)*state->bp; + /* have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto the_end; + + if (is_last_task_frame(state)) { + regs = task_pt_regs(state->task); + + /* + * kthreads (other than the boot CPU's idle thread) have some + * partial regs at the end of their stack which were placed + * there by copy_thread_tls(). But the regs don't have any + * useful information, so we can skip them. + * + * This user_mode() check is slightly broader than a PF_KTHREAD + * check because it also catches the awkward situation where a + * newly forked kthread transitions into a user task by calling + * do_execve(), which eventually clears PF_KTHREAD. + */ + if (!user_mode(regs)) + goto the_end; + + /* + * We're almost at the end, but not quite: there's still the + * syscall regs frame. Entry code doesn't encode the regs + * pointer for syscalls, so we have to set it manually. + */ + state->regs = regs; + state->bp = NULL; + return true; + } + + /* get the next frame pointer */ + if (state->regs) + next_bp = (unsigned long *)state->regs->bp; + else + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); + + /* is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + next_frame = (unsigned long *)regs; + next_len = sizeof(*regs); + } else { + next_frame = next_bp; + next_len = FRAME_HEADER_SIZE; + } /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) - return false; + if (!update_stack_state(state, next_frame, next_len)) { + /* + * Don't warn on bad regs->bp. An interrupt in entry code + * might cause a false positive warning. + */ + if (state->regs) + goto the_end; + + goto bad_address; + } + + /* Make sure it only unwinds up and doesn't overlap the last frame: */ + if (state->stack_info.type == prev_type) { + if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) + goto bad_address; + + if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) + goto bad_address; + } /* move to the next frame */ - state->bp = next_bp; + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + return true; + +bad_address: + /* + * When unwinding a non-current task, the task might actually be + * running on another CPU, in which case it could be modifying its + * stack while we're reading it. This is generally not a problem and + * can be ignored as long as the caller understands that unwinding + * another task will not always succeed. + */ + if (state->task != current) + goto the_end; + + if (state->regs) { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", + state->regs, state->task->comm, + state->task->pid, next_frame); + unwind_dump(state, (unsigned long *)state->regs); + } else { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", + state->bp, state->task->comm, + state->task->pid, next_frame); + unwind_dump(state, state->bp); + } +the_end: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; } EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { + unsigned long *bp, *frame; + size_t len; + memset(state, 0, sizeof(*state)); state->task = task; @@ -73,12 +270,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } /* set up the starting stack frame */ - state->bp = get_frame_pointer(task, regs); + bp = get_frame_pointer(task, regs); + regs = decode_frame_pointer(bp); + if (regs) { + state->regs = regs; + frame = (unsigned long *)regs; + len = sizeof(*regs); + } else { + state->bp = bp; + frame = bp; + len = FRAME_HEADER_SIZE; + } /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(state->bp, state->task, &state->stack_info, + get_stack_info(frame, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + update_stack_state(state, frame, len); /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 01f30e56f99e..23ee89ce59a9 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -35,6 +35,7 @@ #include <linux/interrupt.h> #include <linux/syscalls.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/string.h> @@ -47,7 +48,7 @@ #include <linux/slab.h> #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/irq.h> @@ -160,11 +161,12 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) static void mark_screen_rdonly(struct mm_struct *mm) { + struct vm_area_struct *vma; + spinlock_t *ptl; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - spinlock_t *ptl; int i; down_write(&mm->mmap_sem); @@ -177,7 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pmd = pmd_offset(pud, 0xA0000); if (pmd_trans_huge(*pmd)) { - struct vm_area_struct *vma = find_vma(mm, 0xA0000); + vma = find_vma(mm, 0xA0000); split_huge_pmd(vma, pmd, 0xA0000); } if (pmd_none_or_clear_bad(pmd)) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index dbf67f64d5ec..c74ae9ce8dc4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -91,10 +91,10 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + _stext = .; /* bootstrapping code */ HEAD_TEXT . = ALIGN(8); - _stext = .; TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT @@ -345,7 +345,6 @@ SECTIONS DISCARDS /DISCARD/ : { *(.eh_frame) - *(__func_stack_frame_non_standard) } } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 0bd9f1287f39..11a93f005268 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -89,7 +89,6 @@ struct x86_cpuinit_ops x86_cpuinit = { }; static void default_nmi_init(void) { }; -static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu, @@ -100,7 +99,6 @@ struct x86_platform_ops x86_platform __ro_after_init = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect, .save_sched_clock_state = tsc_save_sched_clock_state, .restore_sched_clock_state = tsc_restore_sched_clock_state, }; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index afa7bbb596cd..efde6cc50875 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,7 +16,9 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ +#include <linux/sched/stat.h> + +#include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> #include "cpuid.h" @@ -65,6 +67,11 @@ u64 kvm_supported_xcr0(void) #define F(x) bit(X86_FEATURE_##x) +/* These are scattered features in cpufeatures.h. */ +#define KVM_CPUID_BIT_AVX512_4VNNIW 2 +#define KVM_CPUID_BIT_AVX512_4FMAPS 3 +#define KF(x) bit(KVM_CPUID_BIT_##x) + int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -81,6 +88,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= F(OSXSAVE); } + best->edx &= ~F(APIC); + if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) + best->edx |= F(APIC); + if (apic) { if (best->ecx & F(TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -114,9 +125,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - if (use_eager_fpu()) - kvm_x86_ops->fpu_activate(vcpu); - /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. @@ -365,16 +373,21 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(AVX512BW) | F(AVX512VL); + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + const u32 kvm_cpuid_7_0_ecx_x86_features = + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -458,12 +471,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); } else { entry->ebx = 0; entry->ecx = 0; + entry->edx = 0; } entry->eax = 0; - entry->edx = 0; break; } case 9: @@ -846,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. - */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; @@ -863,17 +872,17 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } EXPORT_SYMBOL_GPL(kvm_cpuid); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { - u32 function, eax, ebx, ecx, edx; + u32 eax, ebx, ecx, edx; - function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); kvm_register_write(vcpu, VCPU_REGS_RAX, eax); kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a3ce9d260d68..45c7306c8780 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -158,9 +158,11 @@ #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ +#define AlignMask ((u64)7 << 41) #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ -#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ -#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ +#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ @@ -171,6 +173,7 @@ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ +#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -446,6 +449,26 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; +/* + * XXX: inoutclob user must know where the argument is being expanded. + * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault. + */ +#define asm_safe(insn, inoutclob...) \ +({ \ + int _fault = 0; \ + \ + asm volatile("1:" insn "\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: movl $1, %[_fault]\n" \ + " jmp 2b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [_fault] "+qm"(_fault) inoutclob ); \ + \ + _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ +}) + static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -632,21 +655,26 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, * depending on whether they're AVX encoded or not. * * Also included is CMPXCHG16B which is not a vector instruction, yet it is - * subject to the same check. + * subject to the same check. FXSAVE and FXRSTOR are checked here too as their + * 512 bytes of data must be aligned to a 16 byte boundary. */ -static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (likely(size < 16)) - return false; + u64 alignment = ctxt->d & AlignMask; - if (ctxt->d & Aligned) - return true; - else if (ctxt->d & Unaligned) - return false; - else if (ctxt->d & Avx) - return false; - else - return true; + if (likely(size < 16)) + return 1; + + switch (alignment) { + case Unaligned: + case Avx: + return 1; + case Aligned16: + return 16; + case Aligned: + default: + return size; + } } static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, @@ -704,7 +732,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + if (la & (insn_alignment(ctxt, size) - 1)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; bad: @@ -791,6 +819,20 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } +static int segmented_write_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned int size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); +} + /* * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. @@ -1544,7 +1586,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, &ctxt->exception); } -/* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, @@ -1581,20 +1622,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; - /* NULL selector is not valid for TR, CS and SS (except for long mode) */ - if ((seg == VCPU_SREG_CS - || (seg == VCPU_SREG_SS - && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) - || seg == VCPU_SREG_TR) - && null_selector) - goto exception; - /* TR should be in GDT only */ if (seg == VCPU_SREG_TR && (selector & (1 << 2))) goto exception; - if (null_selector) /* for NULL selector skip all following checks */ + /* NULL selector is not valid for TR, CS and (except for long mode) SS */ + if (null_selector) { + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) + goto exception; + + if (seg == VCPU_SREG_SS) { + if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) + goto exception; + + /* + * ctxt->ops->set_segment expects the CPL to be in + * SS.DPL, so fake an expand-up 32-bit data segment. + */ + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + seg_desc.dpl = cpl; + seg_desc.d = 1; + seg_desc.g = 1; + } + + /* Skip all following checks */ goto load; + } ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) @@ -1710,6 +1765,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); + + /* + * None of MOV, POP and LSS can load a NULL selector in CPL=3, but + * they can load it at CPL<3 (Intel's manual says only LSS can, + * but it's wrong). + * + * However, the Intel manual says that putting IST=1/DPL=3 in + * an interrupt gate will result in SS=3 (the AMD manual instead + * says it doesn't), so allow SS=3 in __load_segment_descriptor + * and only forbid it here. + */ + if (seg == VCPU_SREG_SS && selector == 3 && + ctxt->mode == X86EMUL_MODE_PROT64) + return emulate_exception(ctxt, GP_VECTOR, 0, true); + return __load_segment_descriptor(ctxt, selector, seg, cpl, X86_TRANSFER_NONE, NULL); } @@ -3658,8 +3728,8 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, } /* Disable writeback. */ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, ctxt->dst.addr.mem, - &desc_ptr, 2 + ctxt->op_bytes); + return segmented_write_std(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); } static int em_sgdt(struct x86_emulate_ctxt *ctxt) @@ -3842,6 +3912,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_fxsr(struct x86_emulate_ctxt *ctxt) +{ + u32 eax = 1, ebx, ecx = 0, edx; + + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(edx & FFL(FXSR))) + return emulate_ud(ctxt); + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + /* + * Don't emulate a case that should never be hit, instead of working + * around a lack of fxsave64/fxrstor64 on old compilers. + */ + if (ctxt->mode >= X86EMUL_MODE_PROT64) + return X86EMUL_UNHANDLEABLE; + + return X86EMUL_CONTINUE; +} + +/* + * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, + * 1) 16 bit mode + * 2) 32 bit mode + * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs + * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. + * save and restore + * 3) 64-bit mode with REX.W prefix + * - like (2), but XMM 8-15 are being saved and restored + * 4) 64-bit mode without REX.W prefix + * - like (3), but FIP and FDP are 64 bit + * + * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the + * desired result. (4) is not emulated. + * + * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS + * and FPU DS) should match. + */ +static int em_fxsave(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + size_t size; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + ctxt->ops->get_fpu(ctxt); + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) + size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); + else + size = offsetof(struct fxregs_state, xmm_space[0]); + + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); +} + +static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, + struct fxregs_state *new) +{ + int rc = X86EMUL_CONTINUE; + struct fxregs_state old; + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* + * 64 bit host will restore XMM 8-15, which is not correct on non-64 + * bit guests. Load the current values in order to preserve 64 bit + * XMMs after fxrstor. + */ +#ifdef CONFIG_X86_64 + /* XXX: accessing XMM 8-15 very awkwardly */ + memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); +#endif + + /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but + * does save and restore MXCSR. + */ + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) + memcpy(new->xmm_space, old.xmm_space, 8 * 16); + + return rc; +} + +static int em_fxrstor(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (fx_state.mxcsr >> 16) + return emulate_gp(ctxt, 0); + + ctxt->ops->get_fpu(ctxt); + + if (ctxt->mode < X86EMUL_MODE_PROT64) + rc = fxrstor_fixup(ctxt, &fx_state); + + if (rc == X86EMUL_CONTINUE) + rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + return rc; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4104,7 +4299,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -4142,7 +4337,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), - I(SrcMem | Stack, em_push), D(Undefined), + I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -4194,7 +4389,9 @@ static const struct gprefix pfx_0f_ae_7 = { }; static const struct group_dual group15 = { { - N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), + I(ModRM | Aligned16, em_fxsave), + I(ModRM | Aligned16, em_fxrstor), + N, N, N, N, N, GP(0, &pfx_0f_ae_7), }, { N, N, N, N, N, N, N, N, } }; @@ -4360,8 +4557,8 @@ static const struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), + I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), + F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), @@ -5066,21 +5263,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { - bool fault = false; + int rc; ctxt->ops->get_fpu(ctxt); - asm volatile("1: fwait \n\t" - "2: \n\t" - ".pushsection .fixup,\"ax\" \n\t" - "3: \n\t" - "movb $1, %[fault] \n\t" - "jmp 2b \n\t" - ".popsection \n\t" - _ASM_EXTABLE(1b, 3b) - : [fault]"+qm"(fault)); + rc = asm_safe("fwait"); ctxt->ops->put_fpu(ctxt); - if (unlikely(fault)) + if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); return X86EMUL_CONTINUE; @@ -5483,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } + +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->rep_prefix && (ctxt->d & String)) + return false; + + if (ctxt->d & TwoMemOp) + return false; + + return true; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 42b1c83741c8..ebae57ac5902 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -28,6 +28,8 @@ #include <linux/kvm_host.h> #include <linux/highmem.h> +#include <linux/sched/cputime.h> + #include <asm/apicdef.h> #include <trace/events/kvm.h> @@ -291,7 +293,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) return ret; } -int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) +static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); struct kvm_lapic_irq irq; @@ -305,13 +307,13 @@ int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return -ENOENT; memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } @@ -852,6 +854,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; + mutex_lock(&kvm->arch.hyperv.hv_lock); + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + goto out_unlock; + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* * Because the TSC parameters only vary when there is a @@ -859,7 +865,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) - return; + goto out_unlock; /* * While we're computing and writing the parameters, force the @@ -868,15 +874,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - return; + goto out_unlock; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - return; + goto out_unlock; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - return; + goto out_unlock; /* * Now switch to the TSC page mechanism by writing the sequence. @@ -891,6 +897,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = tsc_seq; kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); +out_unlock: + mutex_unlock(&kvm->arch.hyperv.hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, @@ -958,10 +966,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, /* Calculate cpu time spent by current task in 100ns units */ static u64 current_task_runtime_100ns(void) { - cputime_t utime, stime; + u64 utime, stime; task_cputime_adjusted(current, &utime, &stime); - return div_u64(cputime_to_nsecs(utime + stime), 100); + + return div_u64(utime + stime, 100); } static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) @@ -1142,9 +1151,9 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); @@ -1155,9 +1164,9 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata); @@ -1165,7 +1174,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) bool kvm_hv_hypercall_enabled(struct kvm *kvm) { - return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; + return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 16a7134eedac..a78b445ce411 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) - kthread_queue_work(&pit->worker, &pit->expired); + kthread_queue_work(pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); - kthread_queue_work(&pt->worker, &pt->expired); + kthread_queue_work(pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); @@ -667,10 +667,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - kthread_init_worker(&pit->worker); - pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, - "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) + pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker)) goto fail_kthread; kthread_init_work(&pit->expired, pit_do_work); @@ -713,7 +711,7 @@ fail_register_speaker: fail_register_pit: mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); fail_kthread: kvm_free_irq_source_id(kvm, pit->irq_source_id); fail_request: @@ -730,8 +728,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 2f5af0798326..600bee9dcbbd 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -44,8 +44,7 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct kthread_worker worker; - struct task_struct *worker_task; + struct kthread_worker *worker; struct kthread_work expired; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..73ea24d4f119 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = { .write = picdev_eclr_write, }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm) +int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) - return NULL; + return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; @@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); - return s; + kvm->arch.vpic = s; + + return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); @@ -648,13 +650,17 @@ fail_unlock: kfree(s); - return NULL; + return ret; } -void kvm_destroy_pic(struct kvm_pic *vpic) +void kvm_pic_destroy(struct kvm *kvm) { + struct kvm_pic *vpic = kvm->arch.vpic; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + + kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731eb3897..40d5b2cf6061 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -73,8 +73,8 @@ struct kvm_pic { unsigned long irq_states[PIC_NUM_PINS]; }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm_pic *vpic); +int kvm_pic_init(struct kvm *kvm); +void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_split; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int irqchip_kernel(struct kvm *kvm) { - struct kvm_pic *vpic = pic_irqchip(kvm); - bool ret; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; +} - ret = (vpic != NULL); - ret |= irqchip_split(kvm); +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; - /* Read vpic before kvm->irq_routing. */ + /* Matches with wmb after initializing kvm->irq_routing. */ smp_rmb(); return ret; } diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c0191615f23..6825cd36d13b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -23,6 +23,8 @@ #include <linux/kvm_host.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/rculist.h> + #include <trace/events/kvm.h> #include <asm/msidef.h> @@ -41,15 +43,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); - - /* - * XXX: rejecting pic routes when pic isn't in use would be better, - * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. - */ - if (!pic) - return -1; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -58,10 +51,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) - return -1; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -297,16 +286,20 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: - e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; - break; case KVM_IRQCHIP_PIC_SLAVE: + delta = 8; + /* fall through */ + case KVM_IRQCHIP_PIC_MASTER: + if (!pic_in_kernel(kvm)) + goto out; + e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; - delta = 8; break; case KVM_IRQCHIP_IOAPIC: + if (!ioapic_in_kernel(kvm)) + goto out; + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; @@ -409,7 +402,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) void kvm_arch_post_irq_routing_update(struct kvm *kvm) { - if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + if (!irqchip_split(kvm)) return; kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f69340f9fa3..bad6a25067bc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + +static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) +{ + return apic->vcpu->vcpu_id; +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - u32 max_id = 255; + u32 max_id = 255; /* enough space for any xAPIC ID */ mutex_lock(&kvm->arch.apic_map_lock); kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) - max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); @@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; u16 mask; - u32 ldr, aid; + u32 ldr; + u8 xapic_id; + u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; - aid = kvm_apic_id(apic); - ldr = kvm_lapic_get_reg(apic, APIC_LDR); + xapic_id = kvm_xapic_id(apic); + x2apic_id = kvm_x2apic_id(apic); + + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + /* + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, + * prevent them from masking VCPUs with APIC ID <= 0xff. + */ + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; - if (aid <= new->max_apic_id) - new->phys_map[aid] = apic; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; @@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); @@ -317,7 +341,7 @@ static int find_highest_vector(void *bitmap) vec >= 0; vec -= APIC_VECTORS_PER_REG) { reg = bitmap + REG_POS(vec); if (*reg) - return fls(*reg) - 1 + vec; + return __fls(*reg) + vec; } return -1; @@ -337,25 +361,32 @@ static u8 count_vectors(void *bitmap) return count; } -void __kvm_apic_update_irr(u32 *pir, void *regs) +int __kvm_apic_update_irr(u32 *pir, void *regs) { - u32 i, pir_val; + u32 i, vec; + u32 pir_val, irr_val; + int max_irr = -1; - for (i = 0; i <= 7; i++) { - pir_val = xchg(&pir[i], 0); - if (pir_val) - *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + for (i = vec = 0; i <= 7; i++, vec += 32) { + pir_val = READ_ONCE(pir[i]); + irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10)); + if (pir_val) { + irr_val |= xchg(&pir[i], 0); + *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val; + } + if (irr_val) + max_irr = __fls(irr_val) + vec; } + + return max_irr; } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) { struct kvm_lapic *apic = vcpu->arch.apic; - __kvm_apic_update_irr(pir, apic->regs); - - kvm_make_request(KVM_REQ_EVENT, vcpu); + return __kvm_apic_update_irr(pir, apic->regs); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); @@ -375,8 +406,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - if (apic->vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -390,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; if (unlikely(vcpu->arch.apicv_active)) { - /* try to update RVI */ + /* need to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_x86_ops->hwapic_irr_update(vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; apic_clear_vector(vec, apic->regs + APIC_IRR); @@ -482,6 +512,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) */ return apic_find_highest_irr(vcpu->arch.apic); } +EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, @@ -498,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -544,7 +573,19 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void apic_update_ppr(struct kvm_lapic *apic) +static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) +{ + int highest_irr; + if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active) + highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + else + highest_irr = apic_find_highest_irr(apic); + if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) + return -1; + return highest_irr; +} + +static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; @@ -562,13 +603,28 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { + *new_ppr = ppr; + if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + + return ppr < old_ppr; } +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 ppr; + + if (__apic_update_ppr(apic, &ppr) && + apic_has_interrupt_for_ppr(apic, ppr) != -1) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} + +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) +{ + apic_update_ppr(vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); + static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); @@ -577,10 +633,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - if (apic_x2apic_mode(apic)) - return mda == X2APIC_BROADCAST; - - return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; + return mda == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) @@ -589,9 +643,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) return true; if (apic_x2apic_mode(apic)) - return mda == kvm_apic_id(apic); + return mda == kvm_x2apic_id(apic); - return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); + /* + * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if + * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and + * this allows unique addressing of VCPUs with APIC ID over 0xff. + * The 0xff condition is needed because writeable xAPIC ID. + */ + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) + return true; + + return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -608,7 +671,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); - mda = GET_APIC_DEST_FIELD(mda); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -625,9 +687,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) /* The KVM local APIC implementation has two quirks: * - * - the xAPIC MDA stores the destination at bits 24-31, while this - * is not true of struct kvm_lapic_irq's dest_id field. This is - * just a quirk in the API and is not problematic. + * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs + * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. + * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. @@ -643,13 +705,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; - bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && - !ipi && dest_id == APIC_BROADCAST && x2apic_mda) + !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; - return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); + return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, @@ -1090,7 +1151,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - ktime_t remaining; + ktime_t remaining, now; s64 ns; u32 tmcct; @@ -1101,9 +1162,10 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) apic->lapic_timer.period == 0) return 0; - remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) - remaining = ktime_set(0, 0); + remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); tmcct = div64_u64(ns, @@ -1332,7 +1394,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_save(flags); - now = apic->lapic_timer.timer.base->get_time(); + now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; @@ -1347,6 +1409,79 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +static void start_sw_period(struct kvm_lapic *apic) +{ + if (!apic->lapic_timer.period) + return; + + if (apic_lvtt_oneshot(apic) && + ktime_after(ktime_get(), + apic->lapic_timer.target_expiration)) { + apic_timer_expired(apic); + return; + } + + hrtimer_start(&apic->lapic_timer.timer, + apic->lapic_timer.target_expiration, + HRTIMER_MODE_ABS_PINNED); +} + +static bool set_target_expiration(struct kvm_lapic *apic) +{ + ktime_t now; + u64 tscl = rdtsc(); + + now = ktime_get(); + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return false; + + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __func__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + kvm_lapic_get_reg(apic, APIC_TMICT), + apic->lapic_timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->lapic_timer.period))); + + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + + return true; +} + +static void advance_periodic_target_expiration(struct kvm_lapic *apic) +{ + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = + ktime_add_ns(apic->lapic_timer.target_expiration, + apic->lapic_timer.period); +} + bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) @@ -1356,52 +1491,59 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); -static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +static void cancel_hv_timer(struct kvm_lapic *apic) { kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(!apic->lapic_timer.hv_timer_in_use); - WARN_ON(swait_active(&vcpu->wq)); - cancel_hv_tscdeadline(apic); - apic_timer_expired(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); - -static bool start_hv_tscdeadline(struct kvm_lapic *apic) +static bool start_hv_timer(struct kvm_lapic *apic) { u64 tscdeadline = apic->lapic_timer.tscdeadline; - if (atomic_read(&apic->lapic_timer.pending) || + if ((atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) || kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); } else { apic->lapic_timer.hv_timer_in_use = true; hrtimer_cancel(&apic->lapic_timer.timer); /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); + if (atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) + cancel_hv_timer(apic); } trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, apic->lapic_timer.hv_timer_in_use); return apic->lapic_timer.hv_timer_in_use; } +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_timer(apic); + apic_timer_expired(apic); + + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + advance_periodic_target_expiration(apic); + if (!start_hv_timer(apic)) + start_sw_period(apic); + } +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic)) - start_hv_tscdeadline(apic); + start_hv_timer(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1413,62 +1555,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); if (atomic_read(&apic->lapic_timer.pending)) return; - start_sw_tscdeadline(apic); + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now; - atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or periodic mode */ - now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; - - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } - } - - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS_PINNED); - - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_lapic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); + if (set_target_expiration(apic) && + !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) + start_sw_period(apic); } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) start_sw_tscdeadline(apic); } } @@ -1701,13 +1809,22 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!lapic_in_kernel(vcpu)) + return 0; + + return apic->lapic_timer.tscdeadline; +} u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!lapic_in_kernel(vcpu) || + !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -1748,14 +1865,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) { + if (!apic) value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - return; - } vcpu->arch.apic_base = value; + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) + kvm_update_cpuid(vcpu); + + if (!apic) + return; + /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { @@ -1846,9 +1966,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug("%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), + vcpu, kvm_lapic_get_reg(apic, APIC_ID), vcpu->arch.apic_base, apic->base_address); } @@ -1909,6 +2029,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) apic_timer_expired(apic); if (lapic_is_periodic(apic)) { + advance_periodic_target_expiration(apic); hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -1959,17 +2080,13 @@ nomem: int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; + u32 ppr; if (!apic_enabled(apic)) return -1; - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; + __apic_update_ppr(apic, &ppr); + return apic_has_interrupt_for_ppr(apic, ppr); } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) @@ -1993,6 +2110,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) apic->lapic_timer.tscdeadline = 0; + if (apic_lvtt_oneshot(apic)) { + apic->lapic_timer.tscdeadline = 0; + apic->lapic_timer.target_expiration = 0; + } atomic_set(&apic->lapic_timer.pending, 0); } } @@ -2001,6 +2122,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + u32 ppr; if (vector == -1) return -1; @@ -2012,13 +2134,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) * because the process would deliver it through the IDT. */ - apic_set_isr(vector, apic); - apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); + /* + * For auto-EOI interrupts, there might be another pending + * interrupt above PPR, so check whether to raise another + * KVM_REQ_EVENT. + */ apic_update_ppr(apic); + } else { + /* + * For normal interrupts, PPR has been raised and there cannot + * be a higher-priority pending interrupt---except if there was + * a concurrent interrupt injection, but that would have + * triggered KVM_REQ_EVENT already. + */ + apic_set_isr(vector, apic); + __apic_update_ppr(apic, &ppr); } return vector; @@ -2079,8 +2211,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - if (kvm_x86_ops->apicv_post_state_restore) - kvm_x86_ops->apicv_post_state_restore(vcpu); + kvm_x86_ops->apicv_post_state_restore(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); kvm_x86_ops->hwapic_isr_update(vcpu, @@ -2154,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2207,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2308,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } @@ -2360,3 +2491,9 @@ void kvm_lapic_init(void) jump_label_rate_limit(&apic_hw_disabled, HZ); jump_label_rate_limit(&apic_sw_disabled, HZ); } + +void kvm_lapic_exit(void) +{ + static_key_deferred_flush(&apic_hw_disabled); + static_key_deferred_flush(&apic_sw_disabled); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f60d01c29d51..bcbe811f3b97 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -15,6 +15,7 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + ktime_t target_expiration; u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; @@ -70,8 +71,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); -void __kvm_apic_update_irr(u32 *pir, void *regs); -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +int __kvm_apic_update_irr(u32 *pir, void *regs); +int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -85,6 +87,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -108,6 +111,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); +void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) @@ -200,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline u32 kvm_apic_id(struct kvm_lapic *apic) -{ - /* To avoid a race between apic_base and following APIC_ID update when - * switching to x2apic_mode, the x2apic mode returns initial x2apic id. - */ - if (apic_x2apic_mode(apic)) - return apic->vcpu->vcpu_id; - - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e986b4e4..ac7810513d0e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -36,7 +36,10 @@ #include <linux/compiler.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <linux/sched/signal.h> #include <linux/uaccess.h> +#include <linux/hash.h> +#include <linux/kern_levels.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -129,6 +132,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -178,15 +185,40 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { - shadow_mmio_mask = mmio_mask; + shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + /* Always false if shadow_acc_track_mask is zero. */ + return (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -284,17 +316,35 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { + if (acc_track_mask != 0) + acc_track_mask |= SPTE_SPECIAL_MASK; + shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -307,7 +357,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -324,6 +374,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -473,7 +528,7 @@ retry: } #endif -static bool spte_is_locklessly_modifiable(u64 spte) +static bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); @@ -481,36 +536,38 @@ static bool spte_is_locklessly_modifiable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { + if (!is_shadow_present_pte(spte)) + return false; + /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_is_locklessly_modifiable(spte)) + if (spte_can_locklessly_be_made_writable(spte) || + is_access_track_spte(spte)) return true; - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (shadow_accessed_mask) { + if ((spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + return true; + } - return true; + return false; } -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_accessed_spte(u64 spte) { - return (old_spte & bit_mask) && !(new_spte & bit_mask); + return shadow_accessed_mask ? spte & shadow_accessed_mask + : !is_access_track_spte(spte); } -static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_dirty_spte(u64 spte) { - return (old_spte & bit_mask) != (new_spte & bit_mask); + return shadow_dirty_mask ? spte & shadow_dirty_mask + : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -525,25 +582,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. +/* + * Update the SPTE (excluding the PFN), but do not track changes in its + * accessed/dirty status. */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) +static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool ret = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return ret; + return old_spte; } if (!spte_has_volatile_bits(old_spte)) @@ -551,45 +602,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + + return old_spte; +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. + * + * Returns true if the TLB needs to be flushed + */ +static bool mmu_spte_update(u64 *sptep, u64 new_spte) +{ + bool flush = false; + u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + + if (!is_shadow_present_pte(old_spte)) + return false; + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_is_locklessly_modifiable(old_spte) && + if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) - ret = true; - - if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; - } + flush = true; /* - * Flush TLB when accessed/dirty bits are changed in the page tables, + * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. */ - if (spte_is_bit_changed(old_spte, new_spte, - shadow_accessed_mask | shadow_dirty_mask)) - ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { + flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + } + + if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { + flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + } - return ret; + return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. + * Returns non-zero if the PTE was previously valid. */ static int mmu_spte_clear_track_bits(u64 *sptep) { @@ -613,11 +681,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : - PT_WRITABLE_MASK)) + + if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); + return 1; } @@ -636,6 +705,78 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Restore an acc-track PTE back to a regular PTE */ +static u64 restore_acc_track_spte(u64 spte) +{ + u64 new_spte = spte; + u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + WARN_ON_ONCE(!is_access_track_spte(spte)); + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + + return new_spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (!is_accessed_spte(spte)) + return false; + + if (shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + + return true; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1212,7 +1353,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_is_locklessly_modifiable(spte))) + !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); @@ -1420,7 +1561,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1433,7 +1574,8 @@ restart: new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1595,15 +1737,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1615,24 +1750,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int young = 0; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - break; - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + if (is_accessed_spte(*sptep)) + return 1; out: - return young; + return 0; } #define RMAP_RECYCLE_THRESHOLD 1000 @@ -1660,17 +1791,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) { - /* - * We are holding the kvm->mmu_lock, and we are blowing up - * shadow PTEs. MMU notifier consumers need to be kept at bay. - * This is correct as long as we don't decouple the mmu_lock - * protected regions (like invalidate_range_start|end does). - */ - kvm->mmu_notifier_seq++; + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - } return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -1721,7 +1844,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, @@ -1912,17 +2035,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_valid_sp() has skipped that kind of pages. + * for_each_valid_sp() has skipped that kind of pages. */ -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ - || (_sp)->role.invalid) {} else + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct) {} else + for_each_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2124,6 +2247,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool need_sync = false; bool flush = false; + int collisions = 0; LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; @@ -2138,7 +2262,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { + for_each_valid_sp(vcpu->kvm, sp, gfn) { + if (sp->gfn != gfn) { + collisions++; + continue; + } + if (!need_sync && sp->unsync) need_sync = true; @@ -2161,7 +2290,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); - return sp; + goto out; } ++vcpu->kvm->stat.mmu_cache_miss; @@ -2191,6 +2320,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); +out: + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } @@ -2591,6 +2723,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2644,7 +2779,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2877,33 +3012,43 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + return false; + /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. + * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) - return false; - return true; + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } +/* + * Returns true if the SPTE was fixed successfully. Otherwise, + * someone else modified the SPTE from its original value. + */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, u64 new_spte) { gfn_t gfn; WARN_ON(!sp->role.direct); /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in * set_spte. But fast_page_fault is very unlikely to happen with PML @@ -2915,12 +3060,33 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + return false; + + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } +static bool is_access_allowed(u32 fault_err_code, u64 spte) +{ + if (fault_err_code & PFERR_FETCH_MASK) + return is_executable_pte(spte); + + if (fault_err_code & PFERR_WRITE_MASK) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * Return value: * - true: let the vcpu to access on the same address again. @@ -2931,8 +3097,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool ret = false; + bool fault_handled = false; u64 spte = 0ull; + uint retry_count = 0; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return false; @@ -2941,66 +3108,93 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || iterator.level < level) + + do { + u64 new_spte; + + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; + + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) break; - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - ret = true; - goto exit; - } + /* + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_access_allowed(error_code, spte)) { + fault_handled = true; + break; + } - sp = page_header(__pa(iterator.sptep)); - if (!is_last_spte(spte, sp->role.level)) - goto exit; + new_spte = spte; - /* - * Check if it is a spurious fault caused by TLB lazily flushed. - * - * Need not check the access of upper level table entries since - * they are always ACC_ALL. - */ - if (is_writable_pte(spte)) { - ret = true; - goto exit; - } + if (is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte); - /* - * Currently, to simplify the code, only the spte write-protected - * by dirty-log can be fast fixed. - */ - if (!spte_is_locklessly_modifiable(spte)) - goto exit; + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging or access tracking. + */ + if ((error_code & PFERR_WRITE_MASK) && + spte_can_locklessly_be_made_writable(spte)) + { + new_spte |= PT_WRITABLE_MASK; - /* - * Do not fix write-permission on the large spte since we only dirty - * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() - * that means other pages are missed if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte to - * fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - goto exit; + /* + * Do not fix write-permission on the large spte. Since + * we only dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte(), other pages are missed + * if its slot has dirty logging enabled. + * + * Instead, we let the slow page fault path create a + * normal spte to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + break; + } + + /* Verify that the fault can be handled in the fast path */ + if (new_spte == spte || + !is_access_allowed(error_code, new_spte)) + break; + + /* + * Currently, fast page fault only works for direct mapping + * since the gfn is not stable for indirect shadow page. See + * Documentation/virtual/kvm/locking.txt to get more detail. + */ + fault_handled = fast_pf_fix_direct_spte(vcpu, sp, + iterator.sptep, spte, + new_spte); + if (fault_handled) + break; + + if (++retry_count > 4) { + printk_once(KERN_WARNING + "kvm: Fast #PF retrying more than 4 times.\n"); + break; + } + + } while (true); - /* - * Currently, fast page fault only works for direct mapping since - * the gfn is not stable for indirect shadow page. - * See Documentation/virtual/kvm/locking.txt to get more detail. - */ - ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); -exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); + spte, fault_handled); walk_shadow_page_lockless_end(vcpu); - return ret; + return fault_handled; } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, @@ -3909,7 +4103,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * as a SMAP violation if all of the following * conditions are ture: * - X86_CR4_SMAP is set in CR4 - * - An user page is accessed + * - A user page is accessed * - Page fault in kernel mode * - if CPL = 3 or X86_EFLAGS_AC is clear * @@ -4405,7 +4599,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) } static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -4508,7 +4703,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; @@ -4527,12 +4722,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); if (r < 0) return r; if (!r) return 1; + /* + * Before emulating the instruction, check if the error code + * was due to a RO violation while translating the guest page. + * This can occur when using nested virtualization with nested + * paging in both guests. If true, we simply unprotect the page + * and resume the guest. + * + * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used + * in PFERR_NEXT_GUEST_PAGE) + */ + if (error_code == PFERR_NESTED_GUEST_PAGE) { + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + return 1; + } + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; emulate: @@ -4617,11 +4828,19 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node) +{ + kvm_mmu_invalidate_zap_all_pages(kvm); +} + void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; node->track_write = kvm_mmu_pte_write; + node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); } @@ -4958,7 +5177,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) * zap all shadow pages. */ if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { - printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } @@ -5046,6 +5265,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539c3714..37942e419c32 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -14,6 +14,8 @@ */ #include <linux/kvm_host.h> +#include <linux/rculist.h> + #include <asm/kvm_host.h> #include <asm/kvm_page_track.h> @@ -106,6 +108,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); /* * remove the guest page from the tracking pool which stops the interception @@ -135,6 +138,7 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. @@ -181,6 +185,7 @@ kvm_page_track_register_notifier(struct kvm *kvm, hlist_add_head_rcu(&n->node, &head->track_notifier_list); spin_unlock(&kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); /* * stop receiving the event interception. It is the opposed operation of @@ -199,6 +204,7 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } +EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); /* * Notify the node that write access is intercepted and write emulation is @@ -222,6 +228,31 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes); + n->track_write(vcpu, gpa, new, bytes, n); + srcu_read_unlock(&head->track_srcu, idx); +} + +/* + * Notify the node that memory slot is being removed or moved so that it can + * drop write-protection for the pages in the memory slot. + * + * The node should figure out it has any write-protected pages in this slot + * by itself. + */ +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_flush_slot) + n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ca1eca5038d..d1efe2c62b3f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) * a particular vCPU. */ #define SVM_VM_DATA_HASH_BITS 8 -DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); -static spinlock_t svm_vm_data_hash_lock; +static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static DEFINE_SPINLOCK(svm_vm_data_hash_lock); /* Note: * This function is called from IOMMU driver to notify @@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void) } else { pr_info("AVIC enabled\n"); - hash_init(svm_vm_data_hash); - spin_lock_init(&svm_vm_data_hash_lock); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } } @@ -1159,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - svm->vcpu.fpu_active = 1; svm->vcpu.arch.hflags = 0; set_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1901,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm) ulong gcr0 = svm->vcpu.arch.cr0; u64 *hcr0 = &svm->vmcb->save.cr0; - if (!svm->vcpu.fpu_active) - *hcr0 |= SVM_CR0_SELECTIVE_MASK; - else - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); mark_dirty(svm->vmcb, VMCB_CR); - if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); } else { @@ -1940,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!npt_enabled) cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) - cr0 |= X86_CR0_TS; /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -2074,7 +2066,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u32 error_code; + u64 error_code; int r = 1; switch (svm->apf_reason) { @@ -2160,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } -static void svm_fpu_activate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, NM_VECTOR); - - svm->vcpu.fpu_active = 1; - update_cr0_intercept(svm); -} - -static int nm_interception(struct vcpu_svm *svm) -{ - svm_fpu_activate(&svm->vcpu); - return 1; -} - static bool is_erratum_383(void) { int err, i; @@ -2270,7 +2246,7 @@ static int io_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string || in) + if (string) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; @@ -2278,7 +2254,8 @@ static int io_interception(struct vcpu_svm *svm) svm->next_rip = svm->vmcb->control.exit_info_2; skip_emulated_instruction(&svm->vcpu); - return kvm_fast_pio_out(vcpu, size, port); + return in ? kvm_fast_pio_in(vcpu, size, port) + : kvm_fast_pio_out(vcpu, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -2572,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) if (!npt_enabled && svm->apf_reason == 0) return NESTED_EXIT_HOST; break; - case SVM_EXIT_EXCP_BASE + NM_VECTOR: - nm_interception(svm); - break; default: break; } @@ -3150,8 +3124,7 @@ static int skinit_interception(struct vcpu_svm *svm) static int wbinvd_interception(struct vcpu_svm *svm) { - kvm_emulate_wbinvd(&svm->vcpu); - return 1; + return kvm_emulate_wbinvd(&svm->vcpu); } static int xsetbv_interception(struct vcpu_svm *svm) @@ -3238,8 +3211,7 @@ static int task_switch_interception(struct vcpu_svm *svm) static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - kvm_emulate_cpuid(&svm->vcpu); - return 1; + return kvm_emulate_cpuid(&svm->vcpu); } static int iret_interception(struct vcpu_svm *svm) @@ -3275,9 +3247,7 @@ static int rdpmc_interception(struct vcpu_svm *svm) return emulate_on_interception(svm); err = kvm_rdpmc(&svm->vcpu); - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, @@ -3374,9 +3344,7 @@ static int cr_interception(struct vcpu_svm *svm) } kvm_register_write(&svm->vcpu, reg, val); } - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static int dr_interception(struct vcpu_svm *svm) @@ -4025,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, @@ -4187,6 +4154,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -4362,11 +4331,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - return; -} - static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) { kvm_lapic_set_irr(vec, vcpu->arch.apic); @@ -5082,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void) return true; } -static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - set_exception_intercept(svm, NM_VECTOR); - update_cr0_intercept(svm); -} - #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -5350,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_pkru = svm_get_pkru, - .fpu_activate = svm_fpu_activate, - .fpu_deactivate = svm_fpu_deactivate, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, @@ -5376,7 +5329,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, - .sync_pir_to_irr = svm_sync_pir_to_irr, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, .apicv_post_state_restore = avic_post_state_restore, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5382b82462fc..ef4ba71dbb66 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -133,6 +133,16 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 /* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +/* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. @@ -446,23 +456,31 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_true_procbased_ctls_low; u32 nested_vmx_secondary_ctls_low; u32 nested_vmx_secondary_ctls_high; u32 nested_vmx_pinbased_ctls_low; u32 nested_vmx_pinbased_ctls_high; u32 nested_vmx_exit_ctls_low; u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_true_exit_ctls_low; u32 nested_vmx_entry_ctls_low; u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_true_entry_ctls_low; u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; u32 nested_vmx_vpid_caps; + u64 nested_vmx_basic; + u64 nested_vmx_cr0_fixed0; + u64 nested_vmx_cr0_fixed1; + u64 nested_vmx_cr4_fixed0; + u64 nested_vmx_cr4_fixed1; + u64 nested_vmx_vmcs_enum; }; #define POSTED_INTR_ON 0 @@ -520,6 +538,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, @@ -920,16 +944,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; -static unsigned long *vmx_vmread_bitmap; -static unsigned long *vmx_vmwrite_bitmap; +enum { + VMX_IO_BITMAP_A, + VMX_IO_BITMAP_B, + VMX_MSR_BITMAP_LEGACY, + VMX_MSR_BITMAP_LONGMODE, + VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, + VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV, + VMX_MSR_BITMAP_LEGACY_X2APIC, + VMX_MSR_BITMAP_LONGMODE_X2APIC, + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; + +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) +#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) +#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) +#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) +#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) +#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV]) +#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC]) +#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC]) +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -1343,10 +1383,10 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; } -static inline bool is_exception(u32 intr_info) +static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); + == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); } static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, @@ -1816,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); + (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -1825,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - if (vcpu->fpu_active) - eb &= ~(1u << NM_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@ -1952,19 +1990,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->host[i].value = host_val; } -static void reload_tss(void) -{ - /* - * VT restores TR but not its size. Useless. - */ - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - struct desc_struct *descs; - - descs = (void *)gdt->address; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -} - static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { u64 guest_efer = vmx->vcpu.arch.efer; @@ -2019,41 +2044,36 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) } } +#ifdef CONFIG_X86_32 +/* + * On 32-bit kernels, VM exits still load the FS and GS bases from the + * VMCS rather than the segment table. KVM uses this helper to figure + * out the current bases to poke them into the VMCS before entry. + */ static unsigned long segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; - unsigned long table_base; + struct desc_struct *table; unsigned long v; - if (!(selector & ~3)) + if (!(selector & ~SEGMENT_RPL_MASK)) return 0; - table_base = gdt->address; + table = (struct desc_struct *)gdt->address; - if (selector & 4) { /* from ldt */ + if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); - if (!(ldt_selector & ~3)) + if (!(ldt_selector & ~SEGMENT_RPL_MASK)) return 0; - table_base = segment_base(ldt_selector); + table = (struct desc_struct *)segment_base(ldt_selector); } - d = (struct desc_struct *)(table_base + (selector & ~7)); - v = get_desc_base(d); -#ifdef CONFIG_X86_64 - if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; -#endif + v = get_desc_base(&table[selector >> 3]); return v; } - -static inline unsigned long kvm_read_tr_base(void) -{ - u16 tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); -} +#endif static void vmx_save_host_state(struct kvm_vcpu *vcpu) { @@ -2139,18 +2159,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) loadsegment(es, vmx->host_state.es_sel); } #endif - reload_tss(); + invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - /* - * If the FPU is not active (through the host task or - * the guest vcpu), then restore the cr0.TS bit. - */ - if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) - stts(); load_gdt(this_cpu_ptr(&host_gdt)); } @@ -2260,10 +2274,19 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* * Linux uses per-cpu TSS and GDT, so set these when switching - * processors. + * processors. See 22.2.4. */ - vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ + vmcs_writel(HOST_TR_BASE, + (unsigned long)this_cpu_ptr(&cpu_tss)); + vmcs_writel(HOST_GDTR_BASE, gdt->address); + + /* + * VM exits change the host TR limit to 0x67 after a VM + * exit. This is okay, since 0x67 covers everything except + * the IO bitmap and have have code to handle the IO bitmap + * being lost after a VM exit. + */ + BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ @@ -2306,25 +2329,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) } } -static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -{ - ulong cr0; - - if (vcpu->fpu_active) - return; - vcpu->fpu_active = 1; - cr0 = vmcs_readl(GUEST_CR0); - cr0 &= ~(X86_CR0_TS | X86_CR0_MP); - cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); - vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - if (is_guest_mode(vcpu)) - vcpu->arch.cr0_guest_owned_bits &= - ~get_vmcs12(vcpu)->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2343,33 +2347,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields) (fields->cr4_read_shadow & fields->cr4_guest_host_mask); } -static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - /* Note that there is no vcpu->fpu_active = 0 here. The caller must - * set this *before* calling this function. - */ - vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = 0; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - if (is_guest_mode(vcpu)) { - /* - * L1's specified read shadow might not contain the TS bit, - * so now that we turned on shadowing of this bit, we need to - * set this bit of the shadow. Like in nested_vmx_run we need - * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet - * up-to-date here because we just decached cr0.TS (and we'll - * only update vmcs12->guest_cr0 on nested exit). - */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | - (vcpu->arch.cr0 & X86_CR0_TS); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - } else - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); -} - static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; @@ -2529,14 +2506,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv; } else { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; } } else { if (is_long_mode(vcpu)) @@ -2712,9 +2689,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - vmx->nested.nested_vmx_true_exit_ctls_low = - vmx->nested.nested_vmx_exit_ctls_low & - ~VM_EXIT_SAVE_DEBUG_CONTROLS; + vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2733,9 +2708,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - vmx->nested.nested_vmx_true_entry_ctls_low = - vmx->nested.nested_vmx_entry_ctls_low & - ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2768,8 +2741,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ - vmx->nested.nested_vmx_true_procbased_ctls_low = - vmx->nested.nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ @@ -2780,6 +2752,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -2811,8 +2784,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + VMX_VPID_EXTENT_SUPPORTED_MASK; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -2829,14 +2801,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; vmx->nested.nested_vmx_misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + vmx->nested.nested_vmx_basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; + vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); + + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + vmx->nested.nested_vmx_vmcs_enum = 0x2e; +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) { - /* - * Bits 0 in high must be 0, and bits 1 in low must be 1. - */ - return ((control & high) | low) == control; + return fixed_bits_valid(control, low, high); } static inline u64 vmx_control_msr(u32 low, u32 high) @@ -2844,87 +2854,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.nested_vmx_basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.nested_vmx_basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; + highp = &vmx->nested.nested_vmx_pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.nested_vmx_procbased_ctls_low; + highp = &vmx->nested.nested_vmx_procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.nested_vmx_exit_ctls_low; + highp = &vmx->nested.nested_vmx_exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.nested_vmx_entry_ctls_low; + highp = &vmx->nested.nested_vmx_entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.nested_vmx_secondary_ctls_low; + highp = &vmx->nested.nested_vmx_secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.nested_vmx_pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.nested_vmx_misc_low = data; + vmx->nested.nested_vmx_misc_high = data >> 32; + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, + vmx->nested.nested_vmx_vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.nested_vmx_ept_caps = data; + vmx->nested.nested_vmx_vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.nested_vmx_cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.nested_vmx_cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); switch (msr_index) { case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". */ - *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - if (cpu_has_vmx_basic_inout()) - *pdata |= VMX_BASIC_INOUT; + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.nested_vmx_vmcs_enum = data; + return 0; + default: + /* + * The rest of the VMX capability MSRs do not support restore. + */ + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. */ +static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = vmx->nested.nested_vmx_basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); - break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); - break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); - break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( vmx->nested.nested_vmx_misc_low, vmx->nested.nested_vmx_misc_high); break; - /* - * These MSRs specify bits which the guest must keep fixed (on or off) - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE case MSR_IA32_VMX_CR0_FIXED0: - *pdata = VMXON_CR0_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: - *pdata = VMXON_CR4_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + *pdata = vmx->nested.nested_vmx_vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( @@ -3107,7 +3315,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_leave_nested(vcpu); break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - return 1; /* they are read-only */ + if (!msr_info->host_initiated) + return 1; /* they are read-only */ + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3693,7 +3905,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) } vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); + vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } @@ -3869,6 +4081,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty); } +static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -3929,9 +4175,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS | X86_CR0_MP; - vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; @@ -3997,8 +4240,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!nested_vmx_allowed(vcpu)) return 1; } - if (to_vmx(vcpu)->nested.vmxon && - ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return 1; vcpu->arch.cr4 = cr4; @@ -4575,41 +4818,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - /* * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. @@ -4665,48 +4873,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); - } else { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) { if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, + msr, type); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, + msr, type); } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); + msr, type); __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_W); + msr, type); } } @@ -4715,7 +4893,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4726,19 +4904,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -4749,7 +4923,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4818,20 +4991,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; - r = pi_test_and_set_on(&vmx->pi_desc); - kvm_make_request(KVM_REQ_EVENT, vcpu); - if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) - kvm_vcpu_kick(vcpu); -} - -static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!pi_test_and_clear_on(&vmx->pi_desc)) + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(&vmx->pi_desc)) return; - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) + kvm_vcpu_kick(vcpu); } /* @@ -4845,9 +5010,11 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr4; + unsigned long cr0, cr4; - vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ + cr0 = read_cr0(); + WARN_ON(cr0 & X86_CR0_TS); + vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. */ @@ -4990,10 +5157,8 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0x3ull << 62) is set to quickly identify mmio - * spte. */ - kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -5096,7 +5261,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); + set_cr4_guest_host_mask(vmx); if (vmx_xsaves_supported()) @@ -5200,7 +5367,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); - vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); @@ -5234,26 +5401,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) static void enable_irq_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis() || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -5476,14 +5637,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) + if (is_nmi(intr_info)) return 1; /* already handled by vmx_vcpu_run() */ - if (is_no_device(intr_info)) { - vmx_fpu_activate(vcpu); - return 1; - } - if (is_invalid_opcode(intr_info)) { if (is_guest_mode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5587,7 +5743,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu) static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int size, in, string; + int size, in, string, ret; unsigned port; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -5601,9 +5757,14 @@ static int handle_io(struct kvm_vcpu *vcpu) port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; - skip_emulated_instruction(vcpu); - return kvm_fast_pio_out(vcpu, size, port); + ret = kvm_skip_emulated_instruction(vcpu); + + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_fast_pio_out(vcpu, size, port) && ret; } static void @@ -5617,18 +5778,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long always_on = VMXON_CR0_ALWAYSON; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - always_on &= ~(X86_CR0_PE | X86_CR0_PG); - return (val & always_on) == always_on; -} - /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -5647,7 +5796,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vcpu, val)) + if (!nested_guest_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5656,8 +5805,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) return 0; } else { if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + !nested_host_cr0_valid(vcpu, val)) return 1; + return kvm_set_cr0(vcpu, val); } } @@ -5679,28 +5829,13 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as appropriate for clts instruction exit. */ -static void handle_clts(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) { - /* - * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS - * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, - * just pretend it's off (also in arch.cr0 for fpu_activate). - */ - vmcs_writel(CR0_READ_SHADOW, - vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); - vcpu->arch.cr0 &= ~X86_CR0_TS; - } else - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); -} - static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; int err; + int ret; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -5712,50 +5847,49 @@ static int handle_cr(struct kvm_vcpu *vcpu) switch (cr) { case 0: err = handle_set_cr0(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 3: err = kvm_set_cr3(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 4: err = handle_set_cr4(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); - kvm_complete_insn_gp(vcpu, err); + ret = kvm_complete_insn_gp(vcpu, err); if (lapic_in_kernel(vcpu)) - return 1; + return ret; if (cr8_prev <= cr8) - return 1; + return ret; + /* + * TODO: we might be squashing a + * KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } } break; case 2: /* clts */ - handle_clts(vcpu); + WARN_ONCE(1, "Guest should always own CR0.TS"); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - skip_emulated_instruction(vcpu); - vmx_fpu_activate(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 1: /*mov from cr*/ switch (cr) { case 3: val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case 3: /* lmsw */ @@ -5763,8 +5897,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); default: break; } @@ -5835,8 +5968,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) @@ -5868,8 +6000,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) static int handle_cpuid(struct kvm_vcpu *vcpu) { - kvm_emulate_cpuid(vcpu); - return 1; + return kvm_emulate_cpuid(vcpu); } static int handle_rdmsr(struct kvm_vcpu *vcpu) @@ -5890,8 +6021,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) @@ -5911,24 +6041,19 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) } trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_update_ppr(vcpu); return 1; } static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - /* clear pending irq */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5956,8 +6081,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_rdpmc(struct kvm_vcpu *vcpu) @@ -5965,15 +6089,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) int err; err = kvm_rdpmc(vcpu); - kvm_complete_insn_gp(vcpu, err); - - return 1; + return kvm_complete_insn_gp(vcpu, err); } static int handle_wbinvd(struct kvm_vcpu *vcpu) { - kvm_emulate_wbinvd(vcpu); - return 1; + return kvm_emulate_wbinvd(vcpu); } static int handle_xsetbv(struct kvm_vcpu *vcpu) @@ -5982,20 +6103,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); if (kvm_set_xcr(vcpu, index, new_bv) == 0) - skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); return 1; } static int handle_xsaves(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } static int handle_xrstors(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } @@ -6016,8 +6137,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && (offset == APIC_EOI)) { kvm_lapic_set_eoi(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } } return emulate_instruction(vcpu, 0) == EMULATE_DONE; @@ -6144,15 +6264,22 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* it is a read fault? */ - error_code = (exit_qualification << 2) & PFERR_USER_MASK; - /* it is a write fault? */ - error_code |= exit_qualification & PFERR_WRITE_MASK; - /* It is a fetch fault? */ - error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; - /* ept page table is present? */ - error_code |= (exit_qualification & 0x38) != 0; - + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; + + vcpu->arch.gpa_available = true; vcpu->arch.exit_qualification = exit_qualification; return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -6165,12 +6292,12 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { - skip_emulated_instruction(vcpu); trace_kvm_fast_mmio(gpa); - return 1; + return kvm_skip_emulated_instruction(vcpu); } ret = handle_mmio_page_fault(vcpu, gpa, true); + vcpu->arch.gpa_available = true; if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; @@ -6192,12 +6319,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - /* clear pending NMI */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -6343,6 +6466,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6352,50 +6488,13 @@ static __init int hardware_setup(void) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return r; + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) + goto out; + } vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) - goto out3; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out4; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out5; - - vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) - goto out6; - - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out7; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out8; - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6413,7 +6512,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out9; + goto out; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6459,8 +6558,10 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; - if (!cpu_has_vmx_apicv()) + if (!cpu_has_vmx_apicv()) { enable_apicv = 0; + kvm_x86_ops->sync_pir_to_irr = NULL; + } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; @@ -6476,50 +6577,38 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - memcpy(vmx_msr_bitmap_legacy_x2apic, + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv, vmx_msr_bitmap_longmode, PAGE_SIZE); - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + for (msr = 0x800; msr <= 0x8ff; msr++) { + if (msr == 0x839 /* TMCCT */) + continue; + vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); + } + /* - * enable_apicv && kvm_vcpu_apicv_active() + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. */ - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839, true); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b, true); + vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f, true); - - /* - * (enable_apicv && !kvm_vcpu_apicv_active()) || - * !enable_apicv - */ - /* TPR */ - vmx_disable_intercept_msr_read_x2apic(0x808, false); - vmx_disable_intercept_msr_write_x2apic(0x808, false); + vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); @@ -6555,42 +6644,19 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out9: - free_page((unsigned long)vmx_vmwrite_bitmap); -out8: - free_page((unsigned long)vmx_vmread_bitmap); -out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); -out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); out: - free_page((unsigned long)vmx_io_bitmap_a); + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); return r; } static __exit void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); + int i; + + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); free_kvm_area(); } @@ -6604,16 +6670,13 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); - - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_nop(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_mwait(struct kvm_vcpu *vcpu) @@ -6920,49 +6983,48 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } page = nested_get_page(vcpu, vmptr); - if (page == NULL || - *(u32 *)kmap(page) != VMCS12_REVISION) { + if (page == NULL) { nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + if (*(u32 *)kmap(page) != VMCS12_REVISION) { kunmap(page); - skip_emulated_instruction(vcpu); - return 1; + nested_release_page_clean(page); + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); } kunmap(page); + nested_release_page_clean(page); vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case EXIT_REASON_VMPTRLD: if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + VMXERR_VMPTRLD_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); } break; default: @@ -6974,6 +7036,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, return 0; } +static int enter_vmx_operation(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs *shadow_vmcs; + + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; + + if (enable_shadow_vmcs) { + shadow_vmcs = alloc_vmcs(); + if (!shadow_vmcs) + goto out_shadow_vmcs; + /* mark vmcs as shadow */ + shadow_vmcs->revision_id |= (1u << 31); + /* init shadow vmcs */ + vmcs_clear(shadow_vmcs); + vmx->vmcs01.shadow_vmcs = shadow_vmcs; + } + + INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); + vmx->nested.vmcs02_num = 0; + + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + + vmx->nested.vmxon = true; + return 0; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -6984,9 +7093,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ static int handle_vmon(struct kvm_vcpu *vcpu) { + int ret; struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -7013,13 +7122,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) - return 1; - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) @@ -7028,49 +7133,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (cpu_has_vmx_msr_bitmap()) { - vmx->nested.msr_bitmap = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx->nested.msr_bitmap) - goto out_msr_bitmap; - } - - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_vmcs12) - goto out_cached_vmcs12; - - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } - - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; - - vmx->nested.vmxon = true; + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; - -out_shadow_vmcs: - kfree(vmx->nested.cached_vmcs12); - -out_cached_vmcs12: - free_page((unsigned long)vmx->nested.msr_bitmap); - -out_msr_bitmap: - return -ENOMEM; + return kvm_skip_emulated_instruction(vcpu); } /* @@ -7180,9 +7251,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; free_nested(to_vmx(vcpu)); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMCLEAR instruction */ @@ -7221,9 +7291,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) nested_free_vmcs02(vmx, vmptr); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); @@ -7421,7 +7490,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.current_vmptr == -1ull) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); return 0; } return 1; @@ -7435,17 +7503,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* * Now copy part of this value to register or memory, as requested. @@ -7465,8 +7534,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } @@ -7485,10 +7553,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) u64 field_value = 0; struct x86_exception e; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + if (vmx_instruction_info & (1u << 10)) field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); @@ -7508,19 +7578,28 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); +} + +static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) +{ + vmx->nested.current_vmptr = vmptr; + if (enable_shadow_vmcs) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->vmcs01.shadow_vmcs)); + vmx->nested.sync_shadow_vmcs = true; + } } /* Emulate the VMPTRLD instruction */ @@ -7541,8 +7620,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) page = nested_get_page(vcpu, vmptr); if (page == NULL) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); if (new_vmcs12->revision_id != VMCS12_REVISION) { @@ -7550,12 +7628,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) nested_release_page_clean(page); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; /* @@ -7564,19 +7640,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) */ memcpy(vmx->nested.cached_vmcs12, vmx->nested.current_vmcs12, VMCS12_SIZE); - - if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, - __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.sync_shadow_vmcs = true; - } + set_current_vmptr(vmx, vmptr); } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRST instruction */ @@ -7601,8 +7669,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the INVEPT instruction */ @@ -7640,8 +7707,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* According to the Intel VMX instruction reference, the memory @@ -7672,8 +7738,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) break; } - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_invvpid(struct kvm_vcpu *vcpu) @@ -7698,13 +7763,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + types = (vmx->nested.nested_vmx_vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* according to the intel vmx instruction reference, the memory @@ -7720,23 +7785,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: case VMX_VPID_EXTENT_SINGLE_CONTEXT: - /* - * Old versions of KVM use the single-context version so we - * have to support it; just treat it the same as all-context. - */ + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!vpid) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); - nested_vmx_succeed(vcpu); break; default: - /* Trap individual address invalidation invvpid calls */ - BUG_ON(1); - break; + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); } - skip_emulated_instruction(vcpu); - return 1; + __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + nested_vmx_succeed(vcpu); + + return kvm_skip_emulated_instruction(vcpu); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -8018,7 +8086,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: - if (!is_exception(intr_info)) + if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) return enable_ept; @@ -8045,8 +8113,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -8075,6 +8141,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); @@ -8202,7 +8270,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), + name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); @@ -8366,6 +8434,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 vectoring_info = vmx->idt_vectoring_info; trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + vcpu->arch.gpa_available = false; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more @@ -8584,6 +8653,27 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } +static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + + WARN_ON(!vcpu->arch.apicv_active); + if (pi_test_on(&vmx->pi_desc)) { + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); + max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + } else { + max_irr = kvm_lapic_find_highest_irr(vcpu); + } + vmx_hwapic_irr_update(vcpu, max_irr); + return max_irr; +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -8595,6 +8685,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + pi_clear_on(&vmx->pi_desc); + memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8611,8 +8709,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { + if (is_nmi(exit_intr_info)) { kvm_before_handle_nmi(&vmx->vcpu); asm("int $2"); kvm_after_handle_nmi(&vmx->vcpu); @@ -8624,11 +8721,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); register void *__sp asm(_ASM_SP); - /* - * If external interrupt exists, IF bit is set in rflags/eflags on the - * interrupt stack frame, and interrupt will be enabled on a return - * from interrupt handler. - */ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { unsigned int vector; @@ -8813,7 +8905,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; @@ -9283,6 +9375,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) (new_ctl & ~mask) | (cur_ctl & mask)); } +/* + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. + */ +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *entry; + + vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; + vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; + +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ +} while (0) + + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); + /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ + cr4_fixed1_update(bit(11), ecx, bit(2)); + +#undef cr4_fixed1_update +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -9324,6 +9460,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + if (nested_vmx_allowed(vcpu)) + nested_vmx_cr_fixed1_bits_update(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9399,17 +9538,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } -static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 hpa; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || - vmcs12->apic_access_addr >> maxphyaddr) - return false; - /* * Translate L1 physical address to host physical * address for vmcs02. Keep the page pinned, so this @@ -9420,59 +9558,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, nested_release_page(vmx->nested.apic_access_page); vmx->nested.apic_access_page = nested_get_page(vcpu, vmcs12->apic_access_addr); + /* + * If translation failed, no matter: This feature asks + * to exit when accessing the given address, and if it + * can never be accessed, this feature won't do + * anything anyway. + */ + if (vmx->nested.apic_access_page) { + hpa = page_to_phys(vmx->nested.apic_access_page); + vmcs_write64(APIC_ACCESS_ADDR, hpa); + } else { + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + } + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + kvm_vcpu_reload_apic_access_page(vcpu); } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || - vmcs12->virtual_apic_page_addr >> maxphyaddr) - return false; - if (vmx->nested.virtual_apic_page) /* shouldn't happen */ nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); /* - * Failing the vm entry is _not_ what the processor does - * but it's basically the only possibility we have. - * We could still enter the guest if CR8 load exits are - * enabled, CR8 store exits are enabled, and virtualize APIC - * access is disabled; in this case the processor would never - * use the TPR shadow and we could simply clear the bit from - * the execution control. But such a configuration is useless, - * so let's keep the code simple. + * If translation failed, VM entry will fail because + * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. + * Failing the vm entry is _not_ what the processor + * does but it's basically the only possibility we + * have. We could still enter the guest if CR8 load + * exits are enabled, CR8 store exits are enabled, and + * virtualize APIC access is disabled; in this case + * the processor would never use the TPR shadow and we + * could simply clear the bit from the execution + * control. But such a configuration is useless, so + * let's keep the code simple. */ - if (!vmx->nested.virtual_apic_page) - return false; + if (vmx->nested.virtual_apic_page) { + hpa = page_to_phys(vmx->nested.virtual_apic_page); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } } if (nested_cpu_has_posted_intr(vmcs12)) { - if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || - vmcs12->posted_intr_desc_addr >> maxphyaddr) - return false; - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ kunmap(vmx->nested.pi_desc_page); nested_release_page(vmx->nested.pi_desc_page); } vmx->nested.pi_desc_page = nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); - if (!vmx->nested.pi_desc_page) - return false; - vmx->nested.pi_desc = (struct pi_desc *)kmap(vmx->nested.pi_desc_page); if (!vmx->nested.pi_desc) { nested_release_page_clean(vmx->nested.pi_desc_page); - return false; + return; } vmx->nested.pi_desc = (struct pi_desc *)((void *)vmx->nested.pi_desc + (unsigned long)(vmcs12->posted_intr_desc_addr & (PAGE_SIZE - 1))); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); } - - return true; + if (cpu_has_vmx_msr_bitmap() && + nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; + else + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); } static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) @@ -9541,11 +9700,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); @@ -9778,6 +9932,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + u32 *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } + + kvm_mmu_reset_context(vcpu); + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -9786,11 +9983,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. */ -static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; + bool nested_ept_enabled = false; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -9829,21 +10030,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); + if (from_vmentry) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + } else { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, @@ -9872,12 +10078,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } else + } else { exec_control &= ~PIN_BASED_POSTED_INTR; + } vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); @@ -9922,26 +10125,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ - if (!vmx->nested.apic_access_page) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - else - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->nested.apic_access_page)); - } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { - exec_control |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - kvm_vcpu_reload_apic_access_page(vcpu); - } - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); @@ -9955,6 +10138,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_intr_status); } + nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; + + /* + * Write an illegal value to APIC_ACCESS_ADDR. Later, + * nested_get_vmcs12_pages will either fix it up or + * remove the VM execution control. + */ + if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + vmcs_write64(APIC_ACCESS_ADDR, -1ull); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9968,6 +10161,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_constant_host_state(vmx); /* + * Set the MSR load/store lists to match L0's settings. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value * we wrote last (vmx->host_rsp). This cache is no longer relevant @@ -9982,19 +10184,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if + * nested_get_vmcs12_pages can't fix it up, the illegal value + * will result in a VM entry failure. + */ if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->nested.virtual_apic_page)); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); } - if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) - ; /* MSR_BITMAP will be set by following vmx_set_efer. */ - else - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; - /* * Merging of IO bitmap not currently supported. * Rather, exit every time. @@ -10026,16 +10225,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; - } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - + } set_cr4_guest_host_mask(vmx); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + if (from_vmentry && + vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) @@ -10073,18 +10274,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) nested_ept_init_mmu_context(vcpu); } - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified - * TS bit (for lazy fpu) and bits which we consider mandatory enabled. + * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those + * bits which we consider mandatory enabled. * The CR0_READ_SHADOW is what L2 should have expected to read given * the specifications by L1; It's not enough to take * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we @@ -10096,8 +10288,21 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - /* shadow page tables on either EPT or shadow page tables */ - kvm_set_cr3(vcpu, vmcs12->guest_cr3); + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) + vcpu->arch.efer = vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* Shadow page tables on either EPT or shadow page tables. */ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + entry_failure_code)) + return 1; + kvm_mmu_reset_context(vcpu); if (!enable_ept) @@ -10115,76 +10320,28 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + return 0; } -/* - * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 - * for running an L2 nested guest. - */ -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - struct loaded_vmcs *vmcs02; - bool ia32e; - u32 msr_entry_idx; - - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) - return 1; - - skip_emulated_instruction(vcpu); - vmcs12 = get_vmcs12(vcpu); - - if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); - - /* - * The nested entry process starts with enforcing various prerequisites - * on vmcs12 as required by the Intel SDM, and act appropriately when - * they fail: As the SDM explains, some conditions should cause the - * instruction to fail, while others will cause the instruction to seem - * to succeed, but return an EXIT_REASON_INVALID_STATE. - * To speed up the normal (success) code path, we should avoid checking - * for misconfigurations which will anyway be caught by the processor - * when using the merged vmcs02. - */ - if (vmcs12->launch_state == launch) { - nested_vmx_failValid(vcpu, - launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS - : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - return 1; - } if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - - if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, vmx->nested.nested_vmx_secondary_ctls_low, @@ -10193,32 +10350,35 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high)) - { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || - ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { - nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - return 1; - } + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || - ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + return 0; +} + +static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *exit_qual) +{ + bool ia32e; + + *exit_qual = ENTRY_FAIL_DEFAULT; + + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - } - if (vmcs12->vmcs_link_pointer != -1ull) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); + + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && + vmcs12->vmcs_link_pointer != -1ull) { + *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } @@ -10231,16 +10391,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to * CR0.PG) is 1. */ - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { + if (to_vmx(vcpu)->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) return 1; - } } /* @@ -10254,17 +10412,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) return 1; - } } - /* - * We're finally done with prerequisite checking, and can start with - * the nested entry. - */ + return 0; +} + +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct loaded_vmcs *vmcs02; + int cpu; + u32 msr_entry_idx; + u32 exit_qual; vmcs02 = nested_get_current_vmcs02(vmx); if (!vmcs02) @@ -10284,7 +10446,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - prepare_vmcs02(vcpu, vmcs12); + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qual); + return 1; + } + + nested_get_vmcs12_pages(vcpu, vmcs12); msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, @@ -10299,18 +10469,94 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12->launch_state = 1; - if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) - return kvm_vcpu_halt(vcpu); - - vmx->nested.nested_run_pending = 1; - /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). */ + return 0; +} + +/* + * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 + * for running an L2 nested guest. + */ +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_qual; + int ret; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!nested_vmx_check_vmcs12(vcpu)) + goto out; + + vmcs12 = get_vmcs12(vcpu); + + if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + /* + * The nested entry process starts with enforcing various prerequisites + * on vmcs12 as required by the Intel SDM, and act appropriately when + * they fail: As the SDM explains, some conditions should cause the + * instruction to fail, while others will cause the instruction to seem + * to succeed, but return an EXIT_REASON_INVALID_STATE. + * To speed up the normal (success) code path, we should avoid checking + * for misconfigurations which will anyway be caught by the processor + * when using the merged vmcs02. + */ + if (vmcs12->launch_state == launch) { + nested_vmx_failValid(vcpu, + launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS + : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + goto out; + } + + ret = check_vmentry_prereqs(vcpu, vmcs12); + if (ret) { + nested_vmx_failValid(vcpu, ret); + goto out; + } + + /* + * After this point, the trap flag no longer triggers a singlestep trap + * on the vm entry instructions; don't call kvm_skip_emulated_instruction. + * This is not 100% correct; for performance reasons, we delegate most + * of the checks on host state to the processor. If those fail, + * the singlestep trap is missed. + */ + skip_emulated_instruction(vcpu); + + ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual); + if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qual); + return 1; + } + + /* + * We're finally done with prerequisite checking, and can start with + * the nested entry. + */ + + ret = enter_vmx_non_root_mode(vcpu, true); + if (ret) + return ret; + + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) + return kvm_vcpu_halt(vcpu); + + vmx->nested.nested_run_pending = 1; + return 1; + +out: + return kvm_skip_emulated_instruction(vcpu); } /* @@ -10428,7 +10674,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) @@ -10446,21 +10693,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) } /* - * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits - * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), - * and this function updates it to reflect the changes to the guest state while - * L2 was running (and perhaps made some exits which were handled directly by L0 - * without going back to L1), and to reflect the exit reason. - * Note that we do not have to copy here all VMCS fields, just those that - * could have changed by the L2 guest or the exit - i.e., the guest-state and - * exit-information fields only. Other fields are modified by L1 with VMWRITE, - * which already writes to vmcs12 directly. + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) +static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -10566,6 +10805,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); +} + +/* + * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits + * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), + * and this function updates it to reflect the changes to the guest state while + * L2 was running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) +{ + /* update guest state fields: */ + sync_vmcs12(vcpu, vmcs12); /* update exit information fields: */ @@ -10616,6 +10874,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; + u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -10630,31 +10889,26 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't - * actually changed, because it depends on the current state of - * fpu_active (which may have changed). - * Note that vmx_set_cr0 refers to efer set above. + * actually changed, because vmx_set_cr0 refers to efer set above. + * + * CR0_GUEST_HOST_MASK is already set in the original vmcs01 + * (KVM doesn't change it); */ + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; vmx_set_cr0(vcpu, vmcs12->host_cr0); - /* - * If we did fpu_activate()/fpu_deactivate() during L2's run, we need - * to apply the same changes to L1's vmcs. We just set cr0 correctly, - * but we also need to update cr0_guest_host_mask and exception_bitmap. - */ - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* - * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); - */ + /* Same as above - no reason to call set_cr4_guest_host_mask(). */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); - kvm_set_cr3(vcpu, vmcs12->host_cr3); - kvm_mmu_reset_context(vcpu); + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; @@ -10755,6 +11009,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); @@ -10767,6 +11022,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (unlikely(vmx->fail)) + vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -10795,6 +11053,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, @@ -10843,7 +11103,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ if (unlikely(vmx->fail)) { vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + nested_vmx_failValid(vcpu, vm_inst_error); } else nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) @@ -11266,9 +11526,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_pkru = vmx_get_pkru, - .fpu_activate = vmx_fpu_activate, - .fpu_deactivate = vmx_fpu_deactivate, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, @@ -11293,6 +11550,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_enable_apicv = vmx_get_enable_apicv, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .sync_pir_to_irr = vmx_sync_pir_to_irr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04c5d96b1d67..1faf620a6fdc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -54,6 +54,8 @@ #include <linux/pvclock_gtod.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> +#include <linux/sched/stat.h> + #include <trace/events/kvm.h> #include <asm/debugreg.h> @@ -180,6 +182,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -190,6 +193,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; @@ -434,12 +439,14 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) kvm_inject_gp(vcpu, 0); else - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + return 1; } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); @@ -573,7 +580,7 @@ out: } EXPORT_SYMBOL_GPL(load_pdptrs); -static bool pdptrs_changed(struct kvm_vcpu *vcpu) +bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; @@ -599,6 +606,7 @@ out: return changed; } +EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -1128,14 +1136,15 @@ struct pvclock_gtod_data { struct { /* extract of a clocksource struct */ int vclock_mode; - cycle_t cycle_last; - cycle_t mask; + u64 cycle_last; + u64 mask; u32 mult; u32 shift; } clock; u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1159,6 +1168,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1569,9 +1580,9 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) #ifdef CONFIG_X86_64 -static cycle_t read_tsc(void) +static u64 read_tsc(void) { - cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 ret = (u64)rdtsc_ordered(); u64 last = pvclock_gtod_data.clock.cycle_last; if (likely(ret >= last)) @@ -1589,7 +1600,7 @@ static cycle_t read_tsc(void) return last; } -static inline u64 vgettsc(cycle_t *cycle_now) +static inline u64 vgettsc(u64 *cycle_now) { long v; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; @@ -1600,7 +1611,7 @@ static inline u64 vgettsc(cycle_t *cycle_now) return v * gtod->clock.mult; } -static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) +static int do_monotonic_boot(s64 *t, u64 *cycle_now) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1620,8 +1631,30 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ -static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) +static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { /* checked again under seqlock below */ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) @@ -1629,6 +1662,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -1769,7 +1813,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1790,9 +1834,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1806,16 +1850,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2048,7 +2092,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2067,16 +2111,18 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; + vcpu->arch.st.steal.preempted = 0; + if (vcpu->arch.st.steal.version & 1) vcpu->arch.st.steal.version += 1; /* first time write, random junk */ vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2085,14 +2131,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2176,7 +2222,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - u64 gpa_offset; struct kvm_arch *ka = &vcpu->kvm->arch; kvmclock_reset(vcpu); @@ -2198,9 +2243,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - gpa_offset = data & ~(PAGE_MASK | 1); - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2221,7 +2264,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2294,7 +2337,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } else { @@ -2506,7 +2549,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); + vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", + msr_info->index); return 1; } else { vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); @@ -2630,6 +2674,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: + case KVM_CAP_IMMEDIATE_EXIT: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2810,7 +2855,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (kvm_lapic_hv_timer_in_use(vcpu) && kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_get_lapic_target_expiration_tsc(vcpu))) kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update @@ -2826,8 +2871,39 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } +static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + vcpu->arch.st.steal.preempted = 1; + + kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + &vcpu->arch.st.steal.preempted, + offsetof(struct kvm_steal_time, preempted), + sizeof(vcpu->arch.st.steal.preempted)); +} + void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + /* + * Disable page faults because we're in atomic context here. + * kvm_write_guest_offset_cached() would call might_fault() + * that relies on pagefault_disable() to tell if there's a + * bug. NOTE: the write to guest memory may not go through if + * during postcopy live migration or if there's heavy guest + * paging. + */ + pagefault_disable(); + /* + * kvm_memslots() will be called by + * kvm_write_guest_offset_cached() so take the srcu lock. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -2836,7 +2912,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); @@ -3036,6 +3112,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } +static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags); + static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -3072,10 +3150,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + u32 hflags = vcpu->arch.hflags; if (events->smi.smm) - vcpu->arch.hflags |= HF_SMM_MASK; + hflags |= HF_SMM_MASK; else - vcpu->arch.hflags &= ~HF_SMM_MASK; + hflags &= ~HF_SMM_MASK; + kvm_set_hflags(vcpu, hflags); + vcpu->arch.smi_pending = events->smi.pending; if (events->smi.smm_inside_nmi) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; @@ -3143,6 +3224,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) memcpy(dest, xsave, XSAVE_HDR_OFFSET); /* Set XSTATE_BV */ + xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; /* @@ -3303,6 +3385,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, switch (cap->cap) { case KVM_CAP_HYPERV_SYNIC: + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; return kvm_hv_activate_synic(vcpu); default: return -EINVAL; @@ -3855,7 +3939,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3918,40 +4002,41 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. */ + /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); - kvm->arch.vpic = vpic; + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -3987,7 +4072,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4011,7 +4096,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4420,6 +4505,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4446,16 +4546,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4552,6 +4643,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. + * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -4816,7 +4923,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } -int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) +static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; @@ -4836,8 +4943,8 @@ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_emulate_wbinvd_noskip(vcpu); + kvm_emulate_wbinvd_noskip(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -5081,11 +5188,6 @@ static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) { preempt_disable(); kvm_load_guest_fpu(emul_to_vcpu(ctxt)); - /* - * CR0.TS may reference the host fpu state, not the guest fpu state, - * so it may be clear at this point. - */ - clts(); } static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) @@ -5440,7 +5542,6 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; /* * "Certain debug exceptions may clear bit 0-3. The * remaining contents of the DR6 register are never @@ -5453,6 +5554,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag } } +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + int r = EMULATE_DONE; + + kvm_x86_ops->skip_emulated_instruction(vcpu); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + return r == EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); + static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && @@ -5563,6 +5675,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) @@ -5638,6 +5753,49 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); +static int complete_fast_pio_in(struct kvm_vcpu *vcpu) +{ + unsigned long val; + + /* We should only ever be called with arch.pio.count equal to 1 */ + BUG_ON(vcpu->arch.pio.count != 1); + + /* For size less than 4 we merge, else we zero extend */ + val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) + : 0; + + /* + * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * the copy and tracing + */ + emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + + return 1; +} + +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) +{ + unsigned long val; + int ret; + + /* For size less than 4 we merge, else we zero extend */ + val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + + ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, + &val, 1); + if (ret) { + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + return ret; + } + + vcpu->arch.complete_userspace_io = complete_fast_pio_in; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_fast_pio_in); + static int kvmclock_cpu_down_prep(unsigned int cpu) { __this_cpu_write(cpu_tsc_khz, 0); @@ -5773,7 +5931,7 @@ static void kvm_timer_init(void) } pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE", + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", kvmclock_cpu_online, kvmclock_cpu_down_prep); } @@ -5834,9 +5992,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. */ mask |= 1ull; @@ -5935,7 +6090,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -5958,6 +6113,7 @@ out: void kvm_arch_exit(void) { + kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -5987,11 +6143,44 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_vcpu_halt(vcpu); + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_vcpu_halt(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +#ifdef CONFIG_X86_64 +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + u64 cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} +#endif + /* * kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6019,9 +6208,9 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r = 1; + int op_64_bit, r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_skip_emulated_instruction(vcpu); if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -6056,6 +6245,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; +#ifdef CONFIG_X86_64 + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; +#endif default: ret = -KVM_ENOSYS; break; @@ -6077,7 +6271,8 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, + &ctxt->exception); } static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) @@ -6468,7 +6663,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -6559,10 +6754,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ vcpu->arch.apf.halted = true; @@ -6622,21 +6813,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); } - /* - * KVM_REQ_EVENT is not set when posted interrupts are set by - * VT-d hardware, so we have to update RVI unconditionally. - */ - if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (vcpu->arch.apicv_active) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); - } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; @@ -6677,22 +6855,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); + kvm_load_guest_fpu(vcpu); + + /* + * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt + * IPI are then delayed after guest entry, which ensures that they + * result in virtual interrupt delivery. + */ + local_irq_disable(); vcpu->mode = IN_GUEST_MODE; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); /* - * We should set ->mode before check ->requests, - * Please see the comment in kvm_make_all_cpus_request. - * This also orders the write to mode from any reads - * to the page tables done while the VCPU is running. - * Please see the comment in kvm_flush_remote_tlbs. + * 1) We should set ->mode before checking ->requests. Please see + * the comment in kvm_make_all_cpus_request. + * + * 2) For APICv, we should set ->mode before checking PIR.ON. This + * pairs with the memory barrier implicit in pi_test_and_set_on + * (see vmx_deliver_posted_interrupt). + * + * 3) This also orders the write to mode from any reads to the page + * tables done while the VCPU is running. Please see the comment + * in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); - local_irq_disable(); + /* + * This handles the case where a posted interrupt was + * notified with kvm_vcpu_kick. + */ + if (kvm_lapic_enabled(vcpu)) { + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + } if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { @@ -6831,6 +7027,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); } @@ -7002,7 +7201,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - r = vcpu_run(vcpu); + if (kvm_run->immediate_exit) + r = -EINTR; + else + r = vcpu_run(vcpu); out: post_kvm_run_save(vcpu); @@ -7407,25 +7609,13 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->guest_fpu_loaded) { - vcpu->fpu_counter = 0; + if (!vcpu->guest_fpu_loaded) return; - } vcpu->guest_fpu_loaded = 0; copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - /* - * If using eager FPU mode, or if the guest is a frequent user - * of the FPU, just leave the FPU active for next time. - * Every 255 times fpu_counter rolls over to 0; a guest that uses - * the FPU in bursts will revert to loading it on demand. - */ - if (!use_eager_fpu()) { - if (++vcpu->fpu_counter < 5) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - } trace_kvm_fpu(0); } @@ -7824,6 +8014,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); + mutex_init(&kvm->arch.hyperv.hv_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); @@ -8176,7 +8367,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_page_track_flush_slot(kvm, slot); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -8208,9 +8399,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } @@ -8347,9 +8535,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 25da5bc8d83d..d3289d7e78fa 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -497,38 +497,24 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * a whole series of functions like read_cr0() and write_cr0(). * * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but Linux only really cares about one: the horrifically-named Task - * Switched (TS) bit at bit 3 (ie. 8) + * features, but the only cr0 bit that Linux ever used at runtime was the + * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) * * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch, and Linux uses that gratefully, but wouldn't a - * name like "FPUTRAP bit" be a little less cryptic? + * lazily after a task switch if we wanted to, but wouldn't a name like + * "FPUTRAP bit" be a little less cryptic? * - * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. + * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore + * cr0. */ -static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { - lazy_hcall1(LHCALL_TS, val & X86_CR0_TS); - current_cr0 = val; } static unsigned long lguest_read_cr0(void) { - return current_cr0; -} - -/* - * Intel provided a special instruction to clear the TS bit for people too cool - * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. - */ -static void lguest_clts(void) -{ - lazy_hcall1(LHCALL_TS, 0); - current_cr0 &= ~X86_CR0_TS; + return 0; } /* @@ -930,7 +916,7 @@ static unsigned long lguest_tsc_khz(void) * If we can't use the TSC, the kernel falls back to our lower-priority * "lguest_clock", where we read the time value given to us by the Host. */ -static cycle_t lguest_clock_read(struct clocksource *cs) +static u64 lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; @@ -1432,7 +1418,6 @@ __init void lguest_init(void) pv_cpu_ops.load_tls = lguest_load_tls; pv_cpu_ops.get_debugreg = lguest_get_debugreg; pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.clts = lguest_clts; pv_cpu_ops.read_cr0 = lguest_read_cr0; pv_cpu_ops.write_cr0 = lguest_write_cr0; pv_cpu_ops.read_cr4 = lguest_read_cr4; diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d376e4b48f88..c5959576c315 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,53 +16,6 @@ #include <asm/smap.h> #include <asm/export.h> -/* Standard copy_to_user with segment limit checking */ -ENTRY(_copy_to_user) - mov PER_CPU_VAR(current_task), %rax - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_to_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_to_user) -EXPORT_SYMBOL(_copy_to_user) - -/* Standard copy_from_user with segment limit checking */ -ENTRY(_copy_from_user) - mov PER_CPU_VAR(current_task), %rax - movq %rsi,%rcx - addq %rdx,%rcx - jc bad_from_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_from_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_from_user) -EXPORT_SYMBOL(_copy_from_user) - - - .section .fixup,"ax" - /* must zero dest */ -ENTRY(bad_from_user) -bad_from_user: - movl %edx,%ecx - xorl %eax,%eax - rep - stosb -bad_to_user: - movl %edx,%eax - ret -ENDPROC(bad_from_user) - .previous - /* * copy_user_generic_unrolled - memory copy with exception handling. * This version is for CPUs like P4 that don't have efficient micro diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 073d1f1a620b..a8e91ae89fb3 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -156,13 +156,13 @@ EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { + unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy; int d0; xloops *= 4; asm("mull %%edx" :"=d" (xloops), "=&a" (d0) - :"1" (xloops), "0" - (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4))); + :"1" (xloops), "0" (lpj * (HZ / 4))); __delay(++xloops); } diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index d1dee753b949..07764255b611 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -113,14 +113,14 @@ int msr_clear_bit(u32 msr, u8 bit) } #ifdef CONFIG_TRACEPOINTS -void do_trace_write_msr(unsigned msr, u64 val, int failed) +void do_trace_write_msr(unsigned int msr, u64 val, int failed) { trace_write_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_write_msr); EXPORT_TRACEPOINT_SYMBOL(write_msr); -void do_trace_read_msr(unsigned msr, u64 val, int failed) +void do_trace_read_msr(unsigned int msr, u64 val, int failed) { trace_read_msr(msr, val, failed); } diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index b4908789484e..c074799bddae 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -34,3 +34,52 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return ret; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); + +/** + * copy_to_user: - Copy a block of data into user space. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from kernel space to user space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + */ +unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + n = __copy_to_user(to, from, n); + return n; +} +EXPORT_SYMBOL(_copy_to_user); + +/** + * copy_from_user: - Copy a block of data from user space. + * @to: Destination address, in kernel space. + * @from: Source address, in user space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from user space to kernel space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + * + * If some data could not be copied, this function will pad the copied + * data to the requested size using zero bytes. + */ +unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) +{ + if (access_ok(VERIFY_READ, from, n)) + n = __copy_from_user(to, from, n); + else + memset(to, 0, n); + return n; +} +EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 3bc7baf2a711..1f65ff6540f0 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -11,7 +11,7 @@ #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/interrupt.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmx.h> #include <asm/asm.h> @@ -640,52 +640,3 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); - -/** - * copy_to_user: - Copy a block of data into user space. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ -unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -} -EXPORT_SYMBOL(_copy_to_user); - -/** - * copy_from_user: - Copy a block of data from user space. - * @to: Destination address, in kernel space. - * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. - */ -unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) -{ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else - memset(to, 0, n); - return n; -} -EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 9e6545f269e5..2ccc424a57d9 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -19,7 +19,7 @@ #include <linux/signal.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_emu.h" #include "fpu_system.h" diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index e945fedf1de2..0203baefb5c0 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -27,7 +27,7 @@ #include <linux/signal.h> #include <linux/regset.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/traps.h> #include <asm/user.h> #include <asm/fpu/internal.h> diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 8db26591d91a..b8ef9f9d2ffc 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -19,7 +19,7 @@ #include <linux/stddef.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/vm86.h> #include "fpu_system.h" diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index 95228ff042c0..1643054766eb 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -18,7 +18,7 @@ | other processes using the emulator while swapping is in progress. | +---------------------------------------------------------------------------*/ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index d597fe7423c9..2c98965a60ba 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -19,7 +19,7 @@ #include "fpu_emu.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ea9c49adaa1f..58b5bee7ea27 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -15,8 +15,10 @@ #include <linux/debugfs.h> #include <linux/mm.h> #include <linux/init.h> +#include <linux/sched.h> #include <linux/seq_file.h> +#include <asm/kasan.h> #include <asm/pgtable.h> /* @@ -50,6 +52,10 @@ enum address_markers_idx { LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif # ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, # endif @@ -75,6 +81,10 @@ static struct addr_marker address_markers[] = { { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, { 0/* VMALLOC_START */, "vmalloc() Area" }, { 0/* VMEMMAP_START */, "Vmemmap" }, +#ifdef CONFIG_KASAN + { KASAN_SHADOW_START, "KASAN shadow" }, + { KASAN_SHADOW_END, "KASAN shadow end" }, +#endif # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif @@ -326,18 +336,31 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, #if PTRS_PER_PUD > 1 +/* + * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y + * KASAN fills page tables with the same values. Since there is no + * point in checking page table more than once we just skip repeated + * entries. This saves us dozens of seconds during boot. + */ +static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) +{ + return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); +} + static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; pud_t *start; pgprotval_t prot; + pud_t *prev_pud = NULL; start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start)) { + if (!pud_none(*start) && + !pud_already_checked(prev_pud, start, st->check_wx)) { if (pud_large(*start) || !pud_present(*start)) { prot = pud_flags(*start); note_page(m, st, __pgprot(prot), 2); @@ -348,6 +371,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } else note_page(m, st, __pgprot(0), 2); + prev_pud = start; start++; } } @@ -406,6 +430,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, } else note_page(m, &st, __pgprot(0), 1); + cond_resched(); start++; } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index fcd06f7526de..35ea061010a1 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,7 @@ #include <linux/extable.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> +#include <linux/sched/debug.h> + #include <asm/traps.h> #include <asm/kdebug.h> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72ca3b2669..428e31763cb9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -4,6 +4,7 @@ * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ +#include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/extable.h> /* search_exception_tables */ #include <linux/bootmem.h> /* max_low_pfn */ @@ -413,7 +414,7 @@ out: void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* @@ -679,8 +680,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "paging request"); printk(KERN_CONT " at %p\n", (void *) address); - printk(KERN_ALERT "IP:"); - printk_address(regs->ip); + printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 0d4fb3ebbbac..99c7805a9693 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -154,14 +154,12 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, +static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { int nr_start = *nr; - unsigned long pfn = pmd_pfn(pmd); struct dev_pagemap *pgmap = NULL; - pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; do { struct page *page = pfn_to_page(pfn); @@ -180,6 +178,24 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, return 1; } +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -251,9 +267,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, if (!pte_allows_gup(pud_val(pud), write)) return 0; + + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); + if (pud_devmap(pud)) + return __gup_device_huge_pud(pud, addr, end, pages, nr); + /* hugepages are never "special" */ VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; head = pud_page(pud); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 2ae8584b44c7..c5066a260803 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/err.h> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cf8059016ec8..2b4b53e6793f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -34,7 +34,7 @@ #include <asm/asm.h> #include <asm/bios_ebda.h> #include <asm/processor.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/dma.h> #include <asm/fixmap.h> @@ -864,9 +864,6 @@ static noinline int do_test_wp_bit(void) return flag; } -const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); - int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) @@ -939,7 +936,6 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", size >> 10); - rodata_test(); #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 14b9dd71d9e8..15173d37f399 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -36,7 +36,7 @@ #include <asm/processor.h> #include <asm/bios_ebda.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/dma.h> @@ -89,10 +89,10 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* - * When memory was added/removed make sure all the processes MM have + * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end, int removed) +void sync_global_pgds(unsigned long start, unsigned long end) { unsigned long address; @@ -100,12 +100,7 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) const pgd_t *pgd_ref = pgd_offset_k(address); struct page *page; - /* - * When it is called after memory hot remove, pgd_none() - * returns true. In this case (removed == 1), we must clear - * the PGD entries in the local PGD level page. - */ - if (pgd_none(*pgd_ref) && !removed) + if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); @@ -122,13 +117,8 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - if (removed) { - if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) - pgd_clear(pgd); - } else { - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - } + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); spin_unlock(pgt_lock); } @@ -596,7 +586,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, } if (pgd_changed) - sync_global_pgds(vaddr_start, vaddr_end - 1, 0); + sync_global_pgds(vaddr_start, vaddr_end - 1); __flush_tlb_all(); @@ -689,7 +679,7 @@ static void __meminit free_pagetable(struct page *page, int order) if (PageReserved(page)) { __ClearPageReserved(page); - magic = (unsigned long)page->lru.next; + magic = (unsigned long)page->freelist; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); @@ -1010,9 +1000,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); - int kernel_set_to_readonly; void set_kernel_text_rw(void) @@ -1081,8 +1068,6 @@ void mark_rodata_ro(void) all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); - rodata_test(); - #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); @@ -1239,7 +1224,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) } else err = vmemmap_populate_basepages(start, end, node); if (!err) - sync_global_pgds(start, end - 1, 0); + sync_global_pgds(start, end - 1); return err; } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0493c17b8a51..8d63d7a104c3 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -4,6 +4,7 @@ #include <linux/kdebug.h> #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/vmalloc.h> #include <asm/tlbflush.h> diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index d2dc0438d654..7940166c799b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -28,7 +28,8 @@ #include <linux/mm.h> #include <linux/random.h> #include <linux/limits.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include <asm/elf.h> struct va_alignment __read_mostly va_align = { diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e4f800999b32..5126dfd52b18 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -7,6 +7,7 @@ */ #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/mm_types.h> #include <linux/syscalls.h> #include <linux/sched/sysctl.h> @@ -51,7 +52,7 @@ static unsigned long mpx_mmap(unsigned long len) down_write(&mm->mmap_sem); addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); up_write(&mm->mmap_sem); if (populate) mm_populate(addr, populate); @@ -293,7 +294,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) * We were not able to extract an address from the instruction, * probably because there was something invalid in it. */ - if (info->si_addr == (void *)-1) { + if (info->si_addr == (void __user *)-1) { err = -EINVAL; goto err_out; } @@ -350,12 +351,12 @@ int mpx_enable_management(void) * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is * expected to be relatively expensive. Storing the bounds * directory here means that we do not have to do xsave in the - * unmap path; we can just use mm->bd_addr instead. + * unmap path; we can just use mm->context.bd_addr instead. */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); - mm->bd_addr = bd_base; - if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + mm->context.bd_addr = bd_base; + if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; up_write(&mm->mmap_sem); @@ -370,7 +371,7 @@ int mpx_disable_management(void) return -ENXIO; down_write(&mm->mmap_sem); - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; up_write(&mm->mmap_sem); return 0; } @@ -796,7 +797,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, return -EINVAL; len = min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len, NULL); + zap_page_range(vma, addr, len); trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; @@ -893,7 +894,7 @@ static int unmap_entire_bt(struct mm_struct *mm, * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. */ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); } static int try_unmap_single_bt(struct mm_struct *mm, @@ -947,7 +948,7 @@ static int try_unmap_single_bt(struct mm_struct *mm, end = bta_end_vaddr; } - bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start); ret = get_bt_addr(mm, bde_vaddr, &bt_addr); /* * No bounds table there, so nothing to unmap. diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 3f35b48d1d9d..12dcad7297a5 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -19,7 +19,7 @@ #include "numa_internal.h" -int __initdata numa_off; +int numa_off; nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e3353c97d086..28d42130243c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -20,7 +20,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/pat.h> @@ -214,7 +214,20 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; +#ifdef CONFIG_PREEMPT + /* + * Avoid wbinvd() because it causes latencies on all CPUs, + * regardless of any CPU isolation that may be in effect. + * + * This should be extended for CAT enabled systems independent of + * PREEMPT because wbinvd() does not respect the CAT partitions and + * this is exposed to unpriviledged users through the graphics + * subsystem. + */ + unsigned long do_wbinvd = 0; +#else unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ +#endif BUG_ON(irqs_disabled()); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 83e701f160a9..efc32bc6862b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -986,20 +986,17 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return 0; } -int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn) +void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { enum page_cache_mode pcm; if (!pat_enabled()) - return 0; + return; /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); - - return 0; } /* diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 159b52ccd600..d76485b22824 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -47,7 +47,7 @@ static u64 get_subtree_max_end(struct rb_node *node) { u64 ret = 0; if (node) { - struct memtype *data = container_of(node, struct memtype, rb); + struct memtype *data = rb_entry(node, struct memtype, rb); ret = data->subtree_max_end; } return ret; @@ -79,7 +79,7 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, struct memtype *last_lower = NULL; while (node) { - struct memtype *data = container_of(node, struct memtype, rb); + struct memtype *data = rb_entry(node, struct memtype, rb); if (get_subtree_max_end(node->rb_left) > start) { /* Lowest overlap if any must be on left side */ @@ -121,7 +121,7 @@ static struct memtype *memtype_rb_match(struct rb_root *root, node = rb_next(&match->rb); if (node) - match = container_of(node, struct memtype, rb); + match = rb_entry(node, struct memtype, rb); else match = NULL; } @@ -150,7 +150,7 @@ static int memtype_rb_check_conflict(struct rb_root *root, node = rb_next(&match->rb); while (node) { - match = container_of(node, struct memtype, rb); + match = rb_entry(node, struct memtype, rb); if (match->start >= end) /* Checked all possible matches */ goto success; @@ -181,7 +181,7 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct rb_node *parent = NULL; while (*node) { - struct memtype *data = container_of(*node, struct memtype, rb); + struct memtype *data = rb_entry(*node, struct memtype, rb); parent = *node; if (data->subtree_max_end < newdata->end) @@ -270,7 +270,7 @@ int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) } if (node) { /* pos == i */ - struct memtype *this = container_of(node, struct memtype, rb); + struct memtype *this = rb_entry(node, struct memtype, rb); *out = *this; return 0; } else { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3feec5af4e67..6cbdff26bb96 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -445,6 +445,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, return changed; } + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed = !pud_same(*pudp, entry); + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + + if (changed && dirty) { + *pudp = entry; + /* + * We had a write-protection fault here and changed the pud + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ + } + + return changed; +} #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -474,6 +494,17 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) +{ + int ret = 0; + + if (pud_young(*pudp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pudp); + + return ret; +} #endif int ptep_clear_flush_young(struct vm_area_struct *vma, diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index f88ce0e5efd9..2dab69a706ec 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -141,8 +141,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | * Called from the FPU code when creating a fresh set of FPU * registers. This is called from a very specific context where * we know the FPU regstiers are safe for use and we can use PKRU - * directly. The fact that PKRU is only available when we are - * using eagerfpu mode makes this possible. + * directly. */ void copy_init_pkru_to_fpregs(void) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..32322ce9b405 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ @@ -1067,13 +1067,13 @@ common_load: ilen = prog - temp; if (ilen > BPF_MAX_INSN_SIZE) { - pr_err("bpf_jit_compile fatal insn size error\n"); + pr_err("bpf_jit: fatal insn size error\n"); return -EFAULT; } if (image) { if (unlikely(proglen + ilen > oldproglen)) { - pr_err("bpf_jit_compile fatal error\n"); + pr_err("bpf_jit: fatal error\n"); return -EFAULT; } memcpy(image + proglen, temp, ilen); @@ -1085,10 +1085,6 @@ common_load: return proglen; } -void bpf_jit_compile(struct bpf_prog *prog) -{ -} - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; @@ -1169,9 +1165,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) if (image) { bpf_flush_icache(header, image + proglen); - set_memory_ro((unsigned long)header, header->pages); + bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)image; prog->jited = 1; + } else { + prog = orig_prog; } out_addrs: @@ -1182,18 +1180,3 @@ out: tmp : orig_prog); return prog; } - -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 28c04123b6dd..ffdbc4836b4f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -339,10 +339,11 @@ fail: return 0; } -static void nmi_cpu_setup(void *dummy) +static void nmi_cpu_setup(void) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); raw_spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); @@ -369,7 +370,7 @@ static void nmi_cpu_restore_registers(struct op_msrs *msrs) } } -static void nmi_cpu_shutdown(void *dummy) +static void nmi_cpu_shutdown(void) { unsigned int v; int cpu = smp_processor_id(); @@ -387,20 +388,26 @@ static void nmi_cpu_shutdown(void *dummy) nmi_cpu_restore_registers(msrs); } -static void nmi_cpu_up(void *dummy) +static int nmi_cpu_online(unsigned int cpu) { + local_irq_disable(); if (nmi_enabled) - nmi_cpu_setup(dummy); + nmi_cpu_setup(); if (ctr_running) - nmi_cpu_start(dummy); + nmi_cpu_start(NULL); + local_irq_enable(); + return 0; } -static void nmi_cpu_down(void *dummy) +static int nmi_cpu_down_prep(unsigned int cpu) { + local_irq_disable(); if (ctr_running) - nmi_cpu_stop(dummy); + nmi_cpu_stop(NULL); if (nmi_enabled) - nmi_cpu_shutdown(dummy); + nmi_cpu_shutdown(); + local_irq_enable(); + return 0; } static int nmi_create_files(struct dentry *root) @@ -433,26 +440,7 @@ static int nmi_create_files(struct dentry *root) return 0; } -static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, - void *data) -{ - int cpu = (unsigned long)data; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, nmi_cpu_down, NULL, 1); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block oprofile_cpu_nb = { - .notifier_call = oprofile_cpu_notifier -}; +static enum cpuhp_state cpuhp_nmi_online; static int nmi_setup(void) { @@ -495,20 +483,17 @@ static int nmi_setup(void) if (err) goto fail; - cpu_notifier_register_begin(); - - /* Use get/put_online_cpus() to protect 'nmi_enabled' */ - get_online_cpus(); nmi_enabled = 1; /* make nmi_enabled visible to the nmi handler: */ smp_mb(); - on_each_cpu(nmi_cpu_setup, NULL, 1); - __register_cpu_notifier(&oprofile_cpu_nb); - put_online_cpus(); - - cpu_notifier_register_done(); - + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online", + nmi_cpu_online, nmi_cpu_down_prep); + if (err < 0) + goto fail_nmi; + cpuhp_nmi_online = err; return 0; +fail_nmi: + unregister_nmi_handler(NMI_LOCAL, "oprofile"); fail: free_msrs(); return err; @@ -518,17 +503,9 @@ static void nmi_shutdown(void) { struct op_msrs *msrs; - cpu_notifier_register_begin(); - - /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ - get_online_cpus(); - on_each_cpu(nmi_cpu_shutdown, NULL, 1); + cpuhp_remove_state(cpuhp_nmi_online); nmi_enabled = 0; ctr_running = 0; - __unregister_cpu_notifier(&oprofile_cpu_nb); - put_online_cpus(); - - cpu_notifier_register_done(); /* make variables visible to the nmi handler: */ smp_mb(); diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 3cd69832d7f4..3961103e9176 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -114,6 +114,16 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = { DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=42606 */ + { + .callback = set_nouse_crs, + .ident = "Supermicro X8DTH", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), + DMI_MATCH(DMI_PRODUCT_NAME, "X8DTH-i/6/iF/6F"), + DMI_MATCH(DMI_BIOS_VERSION, "2.0a"), + }, + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=15362 */ { diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index c20d2cc7ef64..ae387e5ee6f7 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -327,35 +327,18 @@ static int __init early_root_info_init(void) #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs(void *unused) +static int amd_bus_cpu_online(unsigned int cpu) { u64 reg; + rdmsrl(MSR_AMD64_NB_CFG, reg); if (!(reg & ENABLE_CF8_EXT_CFG)) { reg |= ENABLE_CF8_EXT_CFG; wrmsrl(MSR_AMD64_NB_CFG, reg); } + return 0; } -static int amd_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block amd_cpu_notifier = { - .notifier_call = amd_cpu_notify, -}; - static void __init pci_enable_pci_io_ecs(void) { #ifdef CONFIG_AMD_NB @@ -385,7 +368,7 @@ static void __init pci_enable_pci_io_ecs(void) static int __init pci_io_ecs_init(void) { - int cpu; + int ret; /* assume all cpus from fam10h have IO ECS */ if (boot_cpu_data.x86 < 0x10) @@ -395,12 +378,9 @@ static int __init pci_io_ecs_init(void) if (early_pci_allowed()) pci_enable_pci_io_ecs(); - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, - (void *)(long)cpu); - __register_cpu_notifier(&amd_cpu_notifier); - cpu_notifier_register_done(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "pci/amd_bus:online", + amd_bus_cpu_online, NULL); + WARN_ON(ret < 0); pci_probe |= PCI_HAS_IO_ECS; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index a4fdfa7dcc1b..0cb52ae0a8f0 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -667,7 +667,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) spin_lock(&dma_domain_list_lock); list_for_each_entry(domain, &dma_domain_list, node) { if (pci_domain_nr(pdev->bus) == domain->domain_nr) { - pdev->dev.archdata.dma_ops = domain->dma_ops; + pdev->dev.dma_ops = domain->dma_ops; break; } } diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 052c1cb76305..ec008e800b45 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -179,7 +179,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, } /* We have our own dma_ops: the same as swiotlb but from alloc (above) */ -static struct dma_map_ops sta2x11_dma_ops = { +static const struct dma_map_ops sta2x11_dma_ops = { .alloc = sta2x11_swiotlb_alloc_coherent, .free = x86_swiotlb_free_coherent, .map_page = swiotlb_map_page, @@ -203,7 +203,7 @@ static void sta2x11_setup_pdev(struct pci_dev *pdev) return; pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); - pdev->dev.archdata.dma_ops = &sta2x11_dma_ops; + pdev->dev.dma_ops = &sta2x11_dma_ops; /* We must enable all devices as master, for audio DMA to work */ pci_set_master(pdev); @@ -223,7 +223,7 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { struct sta2x11_mapping *map; - if (dev->archdata.dma_ops != &sta2x11_dma_ops) { + if (dev->dma_ops != &sta2x11_dma_ops) { if (!dev->dma_mask) return false; return addr + size - 1 <= *dev->dma_mask; @@ -247,7 +247,7 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) */ dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - if (dev->archdata.dma_ops != &sta2x11_dma_ops) + if (dev->dma_ops != &sta2x11_dma_ops) return paddr; return p2a(paddr, to_pci_dev(dev)); } @@ -259,7 +259,7 @@ dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) */ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - if (dev->archdata.dma_ops != &sta2x11_dma_ops) + if (dev->dma_ops != &sta2x11_dma_ops) return daddr; return a2p(daddr, to_pci_dev(dev)); } diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index bedfab98077a..e1fb269c87af 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -264,8 +264,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; error: - dev_err(&dev->dev, - "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", + type == PCI_CAP_ID_MSI ? "" : "-X", irq); return irq; } diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 3c3c19ea94df..184842ef332e 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -8,7 +8,6 @@ obj-y += iris/ obj-y += intel/ obj-y += intel-mid/ obj-y += intel-quark/ -obj-y += mellanox/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile index 40983f5b0858..57be88fa34bb 100644 --- a/arch/x86/platform/atom/Makefile +++ b/arch/x86/platform/atom/Makefile @@ -1,2 +1 @@ -obj-$(CONFIG_PMC_ATOM) += pmc_atom.o obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/arch/x86/platform/atom/pmc_atom.c b/arch/x86/platform/atom/pmc_atom.c deleted file mode 100644 index 964ff4fc61f9..000000000000 --- a/arch/x86/platform/atom/pmc_atom.c +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Intel Atom SOC Power Management Controller Driver - * Copyright (c) 2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/device.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> -#include <linux/io.h> - -#include <asm/pmc_atom.h> - -struct pmc_bit_map { - const char *name; - u32 bit_mask; -}; - -struct pmc_reg_map { - const struct pmc_bit_map *d3_sts_0; - const struct pmc_bit_map *d3_sts_1; - const struct pmc_bit_map *func_dis; - const struct pmc_bit_map *func_dis_2; - const struct pmc_bit_map *pss; -}; - -struct pmc_dev { - u32 base_addr; - void __iomem *regmap; - const struct pmc_reg_map *map; -#ifdef CONFIG_DEBUG_FS - struct dentry *dbgfs_dir; -#endif /* CONFIG_DEBUG_FS */ - bool init; -}; - -static struct pmc_dev pmc_device; -static u32 acpi_base_addr; - -static const struct pmc_bit_map d3_sts_0_map[] = { - {"LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, - {"LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, - {"LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, - {"LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, - {"LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, - {"LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, - {"LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, - {"LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, - {"SCC_EMMC", BIT_SCC_EMMC}, - {"SCC_SDIO", BIT_SCC_SDIO}, - {"SCC_SDCARD", BIT_SCC_SDCARD}, - {"SCC_MIPI", BIT_SCC_MIPI}, - {"HDA", BIT_HDA}, - {"LPE", BIT_LPE}, - {"OTG", BIT_OTG}, - {"USH", BIT_USH}, - {"GBE", BIT_GBE}, - {"SATA", BIT_SATA}, - {"USB_EHCI", BIT_USB_EHCI}, - {"SEC", BIT_SEC}, - {"PCIE_PORT0", BIT_PCIE_PORT0}, - {"PCIE_PORT1", BIT_PCIE_PORT1}, - {"PCIE_PORT2", BIT_PCIE_PORT2}, - {"PCIE_PORT3", BIT_PCIE_PORT3}, - {"LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, - {"LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, - {"LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, - {"LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, - {"LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, - {"LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, - {"LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, - {"LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, - {}, -}; - -static struct pmc_bit_map byt_d3_sts_1_map[] = { - {"SMB", BIT_SMB}, - {"OTG_SS_PHY", BIT_OTG_SS_PHY}, - {"USH_SS_PHY", BIT_USH_SS_PHY}, - {"DFX", BIT_DFX}, - {}, -}; - -static struct pmc_bit_map cht_d3_sts_1_map[] = { - {"SMB", BIT_SMB}, - {"GMM", BIT_STS_GMM}, - {"ISH", BIT_STS_ISH}, - {}, -}; - -static struct pmc_bit_map cht_func_dis_2_map[] = { - {"SMB", BIT_SMB}, - {"GMM", BIT_FD_GMM}, - {"ISH", BIT_FD_ISH}, - {}, -}; - -static const struct pmc_bit_map byt_pss_map[] = { - {"GBE", PMC_PSS_BIT_GBE}, - {"SATA", PMC_PSS_BIT_SATA}, - {"HDA", PMC_PSS_BIT_HDA}, - {"SEC", PMC_PSS_BIT_SEC}, - {"PCIE", PMC_PSS_BIT_PCIE}, - {"LPSS", PMC_PSS_BIT_LPSS}, - {"LPE", PMC_PSS_BIT_LPE}, - {"DFX", PMC_PSS_BIT_DFX}, - {"USH_CTRL", PMC_PSS_BIT_USH_CTRL}, - {"USH_SUS", PMC_PSS_BIT_USH_SUS}, - {"USH_VCCS", PMC_PSS_BIT_USH_VCCS}, - {"USH_VCCA", PMC_PSS_BIT_USH_VCCA}, - {"OTG_CTRL", PMC_PSS_BIT_OTG_CTRL}, - {"OTG_VCCS", PMC_PSS_BIT_OTG_VCCS}, - {"OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK}, - {"OTG_VCCA", PMC_PSS_BIT_OTG_VCCA}, - {"USB", PMC_PSS_BIT_USB}, - {"USB_SUS", PMC_PSS_BIT_USB_SUS}, - {}, -}; - -static const struct pmc_bit_map cht_pss_map[] = { - {"SATA", PMC_PSS_BIT_SATA}, - {"HDA", PMC_PSS_BIT_HDA}, - {"SEC", PMC_PSS_BIT_SEC}, - {"PCIE", PMC_PSS_BIT_PCIE}, - {"LPSS", PMC_PSS_BIT_LPSS}, - {"LPE", PMC_PSS_BIT_LPE}, - {"UFS", PMC_PSS_BIT_CHT_UFS}, - {"UXD", PMC_PSS_BIT_CHT_UXD}, - {"UXD_FD", PMC_PSS_BIT_CHT_UXD_FD}, - {"UX_ENG", PMC_PSS_BIT_CHT_UX_ENG}, - {"USB_SUS", PMC_PSS_BIT_CHT_USB_SUS}, - {"GMM", PMC_PSS_BIT_CHT_GMM}, - {"ISH", PMC_PSS_BIT_CHT_ISH}, - {"DFX_MASTER", PMC_PSS_BIT_CHT_DFX_MASTER}, - {"DFX_CLUSTER1", PMC_PSS_BIT_CHT_DFX_CLUSTER1}, - {"DFX_CLUSTER2", PMC_PSS_BIT_CHT_DFX_CLUSTER2}, - {"DFX_CLUSTER3", PMC_PSS_BIT_CHT_DFX_CLUSTER3}, - {"DFX_CLUSTER4", PMC_PSS_BIT_CHT_DFX_CLUSTER4}, - {"DFX_CLUSTER5", PMC_PSS_BIT_CHT_DFX_CLUSTER5}, - {}, -}; - -static const struct pmc_reg_map byt_reg_map = { - .d3_sts_0 = d3_sts_0_map, - .d3_sts_1 = byt_d3_sts_1_map, - .func_dis = d3_sts_0_map, - .func_dis_2 = byt_d3_sts_1_map, - .pss = byt_pss_map, -}; - -static const struct pmc_reg_map cht_reg_map = { - .d3_sts_0 = d3_sts_0_map, - .d3_sts_1 = cht_d3_sts_1_map, - .func_dis = d3_sts_0_map, - .func_dis_2 = cht_func_dis_2_map, - .pss = cht_pss_map, -}; - -static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) -{ - return readl(pmc->regmap + reg_offset); -} - -static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val) -{ - writel(val, pmc->regmap + reg_offset); -} - -int pmc_atom_read(int offset, u32 *value) -{ - struct pmc_dev *pmc = &pmc_device; - - if (!pmc->init) - return -ENODEV; - - *value = pmc_reg_read(pmc, offset); - return 0; -} -EXPORT_SYMBOL_GPL(pmc_atom_read); - -int pmc_atom_write(int offset, u32 value) -{ - struct pmc_dev *pmc = &pmc_device; - - if (!pmc->init) - return -ENODEV; - - pmc_reg_write(pmc, offset, value); - return 0; -} -EXPORT_SYMBOL_GPL(pmc_atom_write); - -static void pmc_power_off(void) -{ - u16 pm1_cnt_port; - u32 pm1_cnt_value; - - pr_info("Preparing to enter system sleep state S5\n"); - - pm1_cnt_port = acpi_base_addr + PM1_CNT; - - pm1_cnt_value = inl(pm1_cnt_port); - pm1_cnt_value &= SLEEP_TYPE_MASK; - pm1_cnt_value |= SLEEP_TYPE_S5; - pm1_cnt_value |= SLEEP_ENABLE; - - outl(pm1_cnt_value, pm1_cnt_port); -} - -static void pmc_hw_reg_setup(struct pmc_dev *pmc) -{ - /* - * Disable PMC S0IX_WAKE_EN events coming from: - * - LPC clock run - * - GPIO_SUS ored dedicated IRQs - * - GPIO_SCORE ored dedicated IRQs - * - GPIO_SUS shared IRQ - * - GPIO_SCORE shared IRQ - */ - pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING); -} - -#ifdef CONFIG_DEBUG_FS -static void pmc_dev_state_print(struct seq_file *s, int reg_index, - u32 sts, const struct pmc_bit_map *sts_map, - u32 fd, const struct pmc_bit_map *fd_map) -{ - int offset = PMC_REG_BIT_WIDTH * reg_index; - int index; - - for (index = 0; sts_map[index].name; index++) { - seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n", - offset + index, sts_map[index].name, - fd_map[index].bit_mask & fd ? "Disabled" : "Enabled ", - sts_map[index].bit_mask & sts ? "D3" : "D0"); - } -} - -static int pmc_dev_state_show(struct seq_file *s, void *unused) -{ - struct pmc_dev *pmc = s->private; - const struct pmc_reg_map *m = pmc->map; - u32 func_dis, func_dis_2; - u32 d3_sts_0, d3_sts_1; - - func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS); - func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2); - d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0); - d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1); - - /* Low part */ - pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis); - - /* High part */ - pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2); - - return 0; -} - -static int pmc_dev_state_open(struct inode *inode, struct file *file) -{ - return single_open(file, pmc_dev_state_show, inode->i_private); -} - -static const struct file_operations pmc_dev_state_ops = { - .open = pmc_dev_state_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int pmc_pss_state_show(struct seq_file *s, void *unused) -{ - struct pmc_dev *pmc = s->private; - const struct pmc_bit_map *map = pmc->map->pss; - u32 pss = pmc_reg_read(pmc, PMC_PSS); - int index; - - for (index = 0; map[index].name; index++) { - seq_printf(s, "Island: %-2d - %-32s\tState: %s\n", - index, map[index].name, - map[index].bit_mask & pss ? "Off" : "On"); - } - return 0; -} - -static int pmc_pss_state_open(struct inode *inode, struct file *file) -{ - return single_open(file, pmc_pss_state_show, inode->i_private); -} - -static const struct file_operations pmc_pss_state_ops = { - .open = pmc_pss_state_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int pmc_sleep_tmr_show(struct seq_file *s, void *unused) -{ - struct pmc_dev *pmc = s->private; - u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr; - - s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT; - s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT; - s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT; - s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT; - s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT; - - seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr); - seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr); - seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr); - seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr); - seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr); - return 0; -} - -static int pmc_sleep_tmr_open(struct inode *inode, struct file *file) -{ - return single_open(file, pmc_sleep_tmr_show, inode->i_private); -} - -static const struct file_operations pmc_sleep_tmr_ops = { - .open = pmc_sleep_tmr_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void pmc_dbgfs_unregister(struct pmc_dev *pmc) -{ - debugfs_remove_recursive(pmc->dbgfs_dir); -} - -static int pmc_dbgfs_register(struct pmc_dev *pmc) -{ - struct dentry *dir, *f; - - dir = debugfs_create_dir("pmc_atom", NULL); - if (!dir) - return -ENOMEM; - - pmc->dbgfs_dir = dir; - - f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, - dir, pmc, &pmc_dev_state_ops); - if (!f) - goto err; - - f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO, - dir, pmc, &pmc_pss_state_ops); - if (!f) - goto err; - - f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, - dir, pmc, &pmc_sleep_tmr_ops); - if (!f) - goto err; - - return 0; -err: - pmc_dbgfs_unregister(pmc); - return -ENODEV; -} -#else -static int pmc_dbgfs_register(struct pmc_dev *pmc) -{ - return 0; -} -#endif /* CONFIG_DEBUG_FS */ - -static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent) -{ - struct pmc_dev *pmc = &pmc_device; - const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data; - int ret; - - /* Obtain ACPI base address */ - pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr); - acpi_base_addr &= ACPI_BASE_ADDR_MASK; - - /* Install power off function */ - if (acpi_base_addr != 0 && pm_power_off == NULL) - pm_power_off = pmc_power_off; - - pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr); - pmc->base_addr &= PMC_BASE_ADDR_MASK; - - pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN); - if (!pmc->regmap) { - dev_err(&pdev->dev, "error: ioremap failed\n"); - return -ENOMEM; - } - - pmc->map = map; - - /* PMC hardware registers setup */ - pmc_hw_reg_setup(pmc); - - ret = pmc_dbgfs_register(pmc); - if (ret) - dev_warn(&pdev->dev, "debugfs register failed\n"); - - pmc->init = true; - return ret; -} - -/* - * Data for PCI driver interface - * - * used by pci_match_id() call below. - */ -static const struct pci_device_id pmc_pci_ids[] = { - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map }, - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map }, - { 0, }, -}; - -static int __init pmc_atom_init(void) -{ - struct pci_dev *pdev = NULL; - const struct pci_device_id *ent; - - /* We look for our device - PCU PMC - * we assume that there is max. one device. - * - * We can't use plain pci_driver mechanism, - * as the device is really a multiple function device, - * main driver that binds to the pci_device is lpc_ich - * and have to find & bind to the device this way. - */ - for_each_pci_dev(pdev) { - ent = pci_match_id(pmc_pci_ids, pdev); - if (ent) - return pmc_setup_dev(pdev, ent); - } - /* Device not found. */ - return -ENODEV; -} - -device_initcall(pmc_atom_init); - -/* -MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>"); -MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface"); -MODULE_LICENSE("GPL v2"); -*/ diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index b27bccd4390f..ce4b06733c09 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -23,11 +23,6 @@ #include <asm/io_apic.h> #include <asm/emergency-restart.h> -static int ce4100_i8042_detect(void) -{ - return 0; -} - /* * The CE4100 platform has an internal 8051 Microcontroller which is * responsible for signaling to the external Power Management Unit the @@ -89,7 +84,7 @@ static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) } static void ce4100_serial_fixup(int port, struct uart_port *up, - unsigned short *capabilites) + u32 *capabilites) { #ifdef CONFIG_EARLY_PRINTK /* @@ -145,7 +140,6 @@ static void sdv_pci_init(void) void __init x86_ce4100_early_setup(void) { x86_init.oem.arch_setup = sdv_arch_setup; - x86_platform.i8042_detect = ce4100_i8042_detect; x86_init.resources.probe_roms = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; x86_init.mpparse.find_smp_config = x86_init_noop; diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 6aad870e8962..04ca8764f0c0 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -19,8 +19,7 @@ #include <linux/efi.h> #include <linux/efi-bgrt.h> -struct acpi_table_bgrt *bgrt_tab; -void *__initdata bgrt_image; +struct acpi_table_bgrt bgrt_tab; size_t __initdata bgrt_image_size; struct bmp_header { @@ -28,66 +27,58 @@ struct bmp_header { u32 size; } __packed; -void __init efi_bgrt_init(void) +void __init efi_bgrt_init(struct acpi_table_header *table) { - acpi_status status; void *image; struct bmp_header bmp_header; + struct acpi_table_bgrt *bgrt = &bgrt_tab; if (acpi_disabled) return; - status = acpi_get_table("BGRT", 0, - (struct acpi_table_header **)&bgrt_tab); - if (ACPI_FAILURE(status)) - return; - - if (bgrt_tab->header.length < sizeof(*bgrt_tab)) { + if (table->length < sizeof(bgrt_tab)) { pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n", - bgrt_tab->header.length, sizeof(*bgrt_tab)); + table->length, sizeof(bgrt_tab)); return; } - if (bgrt_tab->version != 1) { + *bgrt = *(struct acpi_table_bgrt *)table; + if (bgrt->version != 1) { pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n", - bgrt_tab->version); - return; + bgrt->version); + goto out; } - if (bgrt_tab->status & 0xfe) { + if (bgrt->status & 0xfe) { pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n", - bgrt_tab->status); - return; + bgrt->status); + goto out; } - if (bgrt_tab->image_type != 0) { + if (bgrt->image_type != 0) { pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n", - bgrt_tab->image_type); - return; + bgrt->image_type); + goto out; } - if (!bgrt_tab->image_address) { + if (!bgrt->image_address) { pr_notice("Ignoring BGRT: null image address\n"); - return; + goto out; } - image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB); + image = early_memremap(bgrt->image_address, sizeof(bmp_header)); if (!image) { pr_notice("Ignoring BGRT: failed to map image header memory\n"); - return; + goto out; } memcpy(&bmp_header, image, sizeof(bmp_header)); - memunmap(image); + early_memunmap(image, sizeof(bmp_header)); if (bmp_header.id != 0x4d42) { pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", bmp_header.id); - return; + goto out; } bgrt_image_size = bmp_header.size; + efi_mem_reserve(bgrt->image_address, bgrt_image_size); - bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); - if (!bgrt_image) { - pr_notice("Ignoring BGRT: failed to map image memory\n"); - bgrt_image = NULL; - return; - } - - efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size); + return; +out: + memset(bgrt, 0, sizeof(bgrt_tab)); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 936a488d6cf6..565dff3c9a12 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -210,6 +210,70 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } +#define OVERFLOW_ADDR_SHIFT (64 - EFI_PAGE_SHIFT) +#define OVERFLOW_ADDR_MASK (U64_MAX << OVERFLOW_ADDR_SHIFT) +#define U64_HIGH_BIT (~(U64_MAX >> 1)) + +static bool __init efi_memmap_entry_valid(const efi_memory_desc_t *md, int i) +{ + u64 end = (md->num_pages << EFI_PAGE_SHIFT) + md->phys_addr - 1; + u64 end_hi = 0; + char buf[64]; + + if (md->num_pages == 0) { + end = 0; + } else if (md->num_pages > EFI_PAGES_MAX || + EFI_PAGES_MAX - md->num_pages < + (md->phys_addr >> EFI_PAGE_SHIFT)) { + end_hi = (md->num_pages & OVERFLOW_ADDR_MASK) + >> OVERFLOW_ADDR_SHIFT; + + if ((md->phys_addr & U64_HIGH_BIT) && !(end & U64_HIGH_BIT)) + end_hi += 1; + } else { + return true; + } + + pr_warn_once(FW_BUG "Invalid EFI memory map entries:\n"); + + if (end_hi) { + pr_warn("mem%02u: %s range=[0x%016llx-0x%llx%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end_hi, end); + } else { + pr_warn("mem%02u: %s range=[0x%016llx-0x%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end); + } + return false; +} + +static void __init efi_clean_memmap(void) +{ + efi_memory_desc_t *out = efi.memmap.map; + const efi_memory_desc_t *in = out; + const efi_memory_desc_t *end = efi.memmap.map_end; + int i, n_removal; + + for (i = n_removal = 0; in < end; i++) { + if (efi_memmap_entry_valid(in, i)) { + if (out != in) + memcpy(out, in, efi.memmap.desc_size); + out = (void *)out + efi.memmap.desc_size; + } else { + n_removal++; + } + in = (void *)in + efi.memmap.desc_size; + } + + if (n_removal > 0) { + u64 size = efi.memmap.nr_map - n_removal; + + pr_warn("Removing %d invalid memory map entries.\n", n_removal); + efi_memmap_install(efi.memmap.phys_map, size); + } +} + void __init efi_print_memmap(void) { efi_memory_desc_t *md; @@ -472,15 +536,12 @@ void __init efi_init(void) } } + efi_clean_memmap(); + if (efi_enabled(EFI_DBG)) efi_print_memmap(); } -void __init efi_late_init(void) -{ - efi_bgrt_init(); -} - void __init efi_set_executable(efi_memory_desc_t *md, bool executable) { u64 addr, npages; @@ -894,6 +955,11 @@ static void __init __efi_enter_virtual_mode(void) return; } + if (efi_enabled(EFI_DBG)) { + pr_info("EFI runtime memory map:\n"); + efi_print_memmap(); + } + BUG_ON(!efi.systab); if (efi_setup_page_tables(pa, 1 << pg_shift)) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 319148bd4b05..a4695da42d77 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -269,6 +269,22 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) efi_scratch.use_pgd = true; /* + * Certain firmware versions are way too sentimential and still believe + * they are exclusive and unquestionable owners of the first physical page, + * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY + * (but then write-access it later during SetVirtualAddressMap()). + * + * Create a 1:1 mapping for this page, to avoid triple faults during early + * boot with such firmware. We are free to hand this page to the BIOS, + * as trim_bios_range() will reserve the first page and isolate it away + * from memory allocators anyway. + */ + if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) { + pr_err("Failed to create 1:1 mapping for the first page!\n"); + return 1; + } + + /* * When making calls to the firmware everything needs to be 1:1 * mapped and addressable with 32-bit pointers. Map the kernel * text and allocate a new stack because we can't rely on the @@ -398,10 +414,44 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len) efi_setup = phys_addr + sizeof(struct setup_data); } -void __init efi_runtime_update_mappings(void) +static int __init efi_update_mappings(efi_memory_desc_t *md, unsigned long pf) { unsigned long pfn; pgd_t *pgd = efi_pgd; + int err1, err2; + + /* Update the 1:1 mapping */ + pfn = md->phys_addr >> PAGE_SHIFT; + err1 = kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf); + if (err1) { + pr_err("Error while updating 1:1 mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } + + err2 = kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf); + if (err2) { + pr_err("Error while updating VA mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } + + return err1 || err2; +} + +static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md) +{ + unsigned long pf = 0; + + if (md->attribute & EFI_MEMORY_XP) + pf |= _PAGE_NX; + + if (!(md->attribute & EFI_MEMORY_RO)) + pf |= _PAGE_RW; + + return efi_update_mappings(md, pf); +} + +void __init efi_runtime_update_mappings(void) +{ efi_memory_desc_t *md; if (efi_enabled(EFI_OLD_MEMMAP)) { @@ -410,6 +460,24 @@ void __init efi_runtime_update_mappings(void) return; } + /* + * Use the EFI Memory Attribute Table for mapping permissions if it + * exists, since it is intended to supersede EFI_PROPERTIES_TABLE. + */ + if (efi_enabled(EFI_MEM_ATTR)) { + efi_memattr_apply_permissions(NULL, efi_update_mem_attr); + return; + } + + /* + * EFI_MEMORY_ATTRIBUTES_TABLE is intended to replace + * EFI_PROPERTIES_TABLE. So, use EFI_PROPERTIES_TABLE to update + * permissions only if EFI_MEMORY_ATTRIBUTES_TABLE is not + * published by the firmware. Even if we find a buggy implementation of + * EFI_MEMORY_ATTRIBUTES_TABLE, don't fall back to + * EFI_PROPERTIES_TABLE, because of the same reason. + */ + if (!efi_enabled(EFI_NX_PE_DATA)) return; @@ -430,15 +498,7 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; - /* Update the 1:1 mapping */ - pfn = md->phys_addr >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf)) - pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", - md->phys_addr, md->virt_addr); - - if (kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf)) - pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", - md->phys_addr, md->virt_addr); + efi_update_mappings(md, pf); } } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 10aca63a50d7..30031d5293c4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -214,7 +214,7 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Could not allocate boot services memmap\n"); return; @@ -355,7 +355,7 @@ void __init efi_free_boot_services(void) } new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Failed to allocate new EFI memmap\n"); return; diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c index 1693107a518e..0d17c0aafeb1 100644 --- a/arch/x86/platform/goldfish/goldfish.c +++ b/arch/x86/platform/goldfish/goldfish.c @@ -42,10 +42,22 @@ static struct resource goldfish_pdev_bus_resources[] = { } }; +static bool goldfish_enable __initdata; + +static int __init goldfish_setup(char *str) +{ + goldfish_enable = true; + return 0; +} +__setup("goldfish", goldfish_setup); + static int __init goldfish_init(void) { + if (!goldfish_enable) + return -ENODEV; + platform_device_register_simple("goldfish_pdev_bus", -1, - goldfish_pdev_bus_resources, 2); + goldfish_pdev_bus_resources, 2); return 0; } device_initcall(goldfish_init); diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index dd6cfa4ad3ac..a7dbec4dce27 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -5,21 +5,19 @@ obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o # WiFi obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o # IPC Devices -obj-y += platform_ipc.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o -obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices -obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o +obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o # I2C Devices obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o -obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o +obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o # I2C GPIO Expanders @@ -28,4 +26,5 @@ obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o +obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index 52534ec29765..74283875c7e8 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -32,6 +32,9 @@ static struct gpio_keys_button gpio_button[] = { {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, + {KEY_MUTE, -1, 1, "mute_enable", EV_KEY, 0, 20}, + {KEY_VOLUMEUP, -1, 1, "volume_up", EV_KEY, 0, 20}, + {KEY_VOLUMEDOWN, -1, 1, "volume_down", EV_KEY, 0, 20}, {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c deleted file mode 100644 index a84b73d6c4a0..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * platform_ipc.c: IPC platform library file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/sfi.h> -#include <linux/gpio.h> -#include <asm/intel-mid.h> -#include "platform_ipc.h" - -void __init ipc_device_handler(struct sfi_device_table_entry *pentry, - struct devs_id *dev) -{ - struct platform_device *pdev; - void *pdata = NULL; - static struct resource res __initdata = { - .name = "IRQ", - .flags = IORESOURCE_IRQ, - }; - - pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", - pentry->name, pentry->irq); - - /* - * We need to call platform init of IPC devices to fill misc_pdata - * structure. It will be used in msic_init for initialization. - */ - if (dev != NULL) - pdata = dev->get_platform_data(pentry); - - /* - * On Medfield the platform device creation is handled by the MSIC - * MFD driver so we don't need to do it here. - */ - if (intel_mid_has_msic()) - return; - - pdev = platform_device_alloc(pentry->name, 0); - if (pdev == NULL) { - pr_err("out of memory for SFI platform device '%s'.\n", - pentry->name); - return; - } - res.start = pentry->irq; - platform_device_add_resources(pdev, &res, 1); - - pdev->dev.platform_data = pdata; - intel_scu_device_register(pdev); -} - -static const struct devs_id pmic_audio_dev_id __initconst = { - .name = "pmic_audio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .device_handler = &ipc_device_handler, -}; - -sfi_device(pmic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h deleted file mode 100644 index 79bb09d4f718..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * platform_ipc.h: IPC platform library header file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#ifndef _PLATFORM_IPC_H_ -#define _PLATFORM_IPC_H_ - -void __init -ipc_device_handler(struct sfi_device_table_entry *pentry, struct devs_id *dev); - -#endif diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c new file mode 100644 index 000000000000..3135416df037 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c @@ -0,0 +1,48 @@ +/* + * Intel Merrifield legacy RTC initialization file + * + * (C) Copyright 2017 Intel Corporation + * + * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> + +#include <asm/hw_irq.h> +#include <asm/intel-mid.h> +#include <asm/io_apic.h> +#include <asm/time.h> +#include <asm/x86_init.h> + +static int __init mrfld_legacy_rtc_alloc_irq(void) +{ + struct irq_alloc_info info; + int ret; + + if (!x86_platform.legacy.rtc) + return -ENODEV; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0); + ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info); + if (ret < 0) { + pr_info("Failed to allocate RTC interrupt. Disabling RTC\n"); + x86_platform.legacy.rtc = 0; + return ret; + } + + return 0; +} + +static int __init mrfld_legacy_rtc_init(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + return mrfld_legacy_rtc_alloc_irq(); +} +arch_initcall(mrfld_legacy_rtc_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c index 30c601b399ee..27186ad654c9 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c @@ -11,6 +11,7 @@ * of the License. */ +#include <linux/err.h> #include <linux/init.h> #include <linux/sfi.h> #include <linux/spi/pxa2xx_spi.h> @@ -34,6 +35,9 @@ static void __init *spidev_platform_data(void *info) { struct spi_board_info *spi_info = info; + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return ERR_PTR(-ENODEV); + spi_info->mode = SPI_MODE_0; spi_info->controller_data = &spidev_spi_chip; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 3f1f1c77d090..86edd1e941eb 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -28,9 +28,9 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { - int gsi; struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; + int gsi, irq; if (!pdata) return -EINVAL; @@ -38,10 +38,10 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); - if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { - dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", - gsi); - return -EINVAL; + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + if (irq < 0) { + dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); + return irq; } return 0; @@ -82,4 +82,4 @@ static int __init register_mid_wdt(void) return 0; } -rootfs_initcall(register_mid_wdt); +arch_initcall(register_mid_wdt); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c index cb3490ecb341..d4dc744dd5a5 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c @@ -20,7 +20,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void *msic_audio_platform_data(void *info) { @@ -40,8 +39,8 @@ static const struct devs_id msic_audio_dev_id __initconst = { .name = "msic_audio", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_audio_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_audio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c index 4f72193939a6..5c3e9919633f 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c @@ -19,7 +19,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_battery_platform_data(void *info) { @@ -30,8 +29,8 @@ static const struct devs_id msic_battery_dev_id __initconst = { .name = "msic_battery", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_battery_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_battery_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c index 70de5b531ba0..9fdb88d460d7 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c @@ -20,7 +20,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_gpio_platform_data(void *info) { @@ -41,8 +40,8 @@ static const struct devs_id msic_gpio_dev_id __initconst = { .name = "msic_gpio", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_gpio_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_gpio_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c index 3d7c2011b6cf..7ae37cdbf256 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c @@ -20,7 +20,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_ocd_platform_data(void *info) { @@ -42,8 +41,8 @@ static const struct devs_id msic_ocd_dev_id __initconst = { .name = "msic_ocd", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_ocd_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_ocd_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c index 038f618fbc52..96809b98cf69 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c @@ -18,7 +18,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_power_btn_platform_data(void *info) { @@ -29,8 +28,8 @@ static const struct devs_id msic_power_btn_dev_id __initconst = { .name = "msic_power_btn", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_power_btn_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c index 114a5755b1e4..3e4167d246cd 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c @@ -19,7 +19,6 @@ #include <asm/intel-mid.h> #include "platform_msic.h" -#include "platform_ipc.h" static void __init *msic_thermal_platform_data(void *info) { @@ -30,8 +29,8 @@ static const struct devs_id msic_thermal_dev_id __initconst = { .name = "msic_thermal", .type = SFI_DEV_TYPE_IPC, .delay = 1, + .msic = 1, .get_platform_data = &msic_thermal_platform_data, - .device_handler = &ipc_device_handler, }; sfi_device(msic_thermal_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c deleted file mode 100644 index e30cb62e3300..000000000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * platform_pmic_gpio.c: PMIC GPIO platform data initialization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/scatterlist.h> -#include <linux/gpio.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/intel_pmic_gpio.h> -#include <asm/intel-mid.h> - -#include "platform_ipc.h" - -static void __init *pmic_gpio_platform_data(void *info) -{ - static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; - int gpio_base = get_gpio_by_name("pmic_gpio_base"); - - if (gpio_base < 0) - gpio_base = 64; - pmic_gpio_pdata.gpio_base = gpio_base; - pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; - pmic_gpio_pdata.gpiointr = 0xffffeff8; - - return &pmic_gpio_pdata; -} - -static const struct devs_id pmic_gpio_spi_dev_id __initconst = { - .name = "pmic_gpio", - .type = SFI_DEV_TYPE_SPI, - .delay = 1, - .get_platform_data = &pmic_gpio_platform_data, -}; - -static const struct devs_id pmic_gpio_ipc_dev_id __initconst = { - .name = "pmic_gpio", - .type = SFI_DEV_TYPE_IPC, - .delay = 1, - .get_platform_data = &pmic_gpio_platform_data, - .device_handler = &ipc_device_handler -}; - -sfi_device(pmic_gpio_spi_dev_id); -sfi_device(pmic_gpio_ipc_dev_id); diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 7850128f0026..12a272582cdc 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -161,12 +161,6 @@ out: regulator_has_full_constraints(); } -/* MID systems don't have i8042 controller */ -static int intel_mid_i8042_detect(void) -{ - return 0; -} - /* * Moorestown does not have external NMI source nor port 0x61 to report * NMI status. The possible NMI sources are from pmu as a result of NMI @@ -197,7 +191,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; - x86_platform.i8042_detect = intel_mid_i8042_detect; x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 1eb47b6298c2..e793fe509971 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -49,8 +49,13 @@ static unsigned long __init mfld_calibrate_tsc(void) fast_calibrate = ratio * fsb; pr_debug("read penwell tsc %lu khz\n", fast_calibrate); lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index 59253db41bbc..ae7bdeb0e507 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -78,8 +78,12 @@ static unsigned long __init tangier_calibrate_tsc(void) pr_debug("Setting lapic_timer_frequency = %d\n", lapic_timer_frequency); - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } @@ -87,6 +91,7 @@ static unsigned long __init tangier_calibrate_tsc(void) static void __init tangier_arch_setup(void) { x86_platform.calibrate_tsc = tangier_calibrate_tsc; + x86_platform.legacy.rtc = 1; } /* tangier arch ops */ diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 67375dda451c..ef03852ea6e8 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -270,7 +270,6 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) return 0; } -EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state); pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev) { diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 051d264fce2e..19b43e3a9f0f 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -15,7 +15,6 @@ #include <linux/interrupt.h> #include <linux/scatterlist.h> #include <linux/sfi.h> -#include <linux/intel_pmic_gpio.h> #include <linux/spi/spi.h> #include <linux/i2c.h> #include <linux/skbuff.h> @@ -226,7 +225,7 @@ int get_gpio_by_name(const char *name) return -EINVAL; } -void __init intel_scu_device_register(struct platform_device *pdev) +static void __init intel_scu_ipc_device_register(struct platform_device *pdev) { if (ipc_next_dev == MAX_IPCDEVS) pr_err("too many SCU IPC devices"); @@ -335,10 +334,22 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", pentry->name, pentry->irq); + + /* + * We need to call platform init of IPC devices to fill misc_pdata + * structure. It will be used in msic_init for initialization. + */ pdata = intel_mid_sfi_get_pdata(dev, pentry); if (IS_ERR(pdata)) return; + /* + * On Medfield the platform device creation is handled by the MSIC + * MFD driver so we don't need to do it here. + */ + if (dev->msic && intel_mid_has_msic()) + return; + pdev = platform_device_alloc(pentry->name, 0); if (pdev == NULL) { pr_err("out of memory for SFI platform device '%s'.\n", @@ -348,7 +359,10 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, install_irq_resource(pdev, pentry->irq); pdev->dev.platform_data = pdata; - platform_device_add(pdev); + if (dev->delay) + intel_scu_ipc_device_register(pdev); + else + platform_device_add(pdev); } static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry, @@ -503,27 +517,23 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) if (!dev) continue; - if (dev->device_handler) { - dev->device_handler(pentry, dev); - } else { - switch (pentry->type) { - case SFI_DEV_TYPE_IPC: - sfi_handle_ipc_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SPI: - sfi_handle_spi_dev(pentry, dev); - break; - case SFI_DEV_TYPE_I2C: - sfi_handle_i2c_dev(pentry, dev); - break; - case SFI_DEV_TYPE_SD: - sfi_handle_sd_dev(pentry, dev); - break; - case SFI_DEV_TYPE_UART: - case SFI_DEV_TYPE_HSI: - default: - break; - } + switch (pentry->type) { + case SFI_DEV_TYPE_IPC: + sfi_handle_ipc_dev(pentry, dev); + break; + case SFI_DEV_TYPE_SPI: + sfi_handle_spi_dev(pentry, dev); + break; + case SFI_DEV_TYPE_I2C: + sfi_handle_i2c_dev(pentry, dev); + break; + case SFI_DEV_TYPE_SD: + sfi_handle_sd_dev(pentry, dev); + break; + case SFI_DEV_TYPE_UART: + case SFI_DEV_TYPE_HSI: + default: + break; } } return 0; diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index f5bad40936ac..b8f562049cad 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -25,7 +25,8 @@ * @fmt: format string. * ... variadic argument list. */ -static void __init imr_self_test_result(int res, const char *fmt, ...) +static __printf(2, 3) +void __init imr_self_test_result(int res, const char *fmt, ...) { va_list vlist; diff --git a/arch/x86/platform/mellanox/Makefile b/arch/x86/platform/mellanox/Makefile deleted file mode 100644 index f43c93188a1d..000000000000 --- a/arch/x86/platform/mellanox/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c deleted file mode 100644 index 7dcfcca97399..000000000000 --- a/arch/x86/platform/mellanox/mlx-platform.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * arch/x86/platform/mellanox/mlx-platform.c - * Copyright (c) 2016 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016 Vadim Pasternak <vadimp@mellanox.com> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include <linux/device.h> -#include <linux/dmi.h> -#include <linux/i2c.h> -#include <linux/i2c-mux.h> -#include <linux/module.h> -#include <linux/platform_device.h> -#include <linux/platform_data/i2c-mux-reg.h> - -#define MLX_PLAT_DEVICE_NAME "mlxplat" - -/* LPC bus IO offsets */ -#define MLXPLAT_CPLD_LPC_I2C_BASE_ADRR 0x2000 -#define MLXPLAT_CPLD_LPC_REG_BASE_ADRR 0x2500 -#define MLXPLAT_CPLD_LPC_IO_RANGE 0x100 -#define MLXPLAT_CPLD_LPC_I2C_CH1_OFF 0xdb -#define MLXPLAT_CPLD_LPC_I2C_CH2_OFF 0xda -#define MLXPLAT_CPLD_LPC_PIO_OFFSET 0x10000UL -#define MLXPLAT_CPLD_LPC_REG1 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ - MLXPLAT_CPLD_LPC_I2C_CH1_OFF) | \ - MLXPLAT_CPLD_LPC_PIO_OFFSET) -#define MLXPLAT_CPLD_LPC_REG2 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ - MLXPLAT_CPLD_LPC_I2C_CH2_OFF) | \ - MLXPLAT_CPLD_LPC_PIO_OFFSET) - -/* Start channel numbers */ -#define MLXPLAT_CPLD_CH1 2 -#define MLXPLAT_CPLD_CH2 10 - -/* Number of LPC attached MUX platform devices */ -#define MLXPLAT_CPLD_LPC_MUX_DEVS 2 - -/* mlxplat_priv - platform private data - * @pdev_i2c - i2c controller platform device - * @pdev_mux - array of mux platform devices - */ -struct mlxplat_priv { - struct platform_device *pdev_i2c; - struct platform_device *pdev_mux[MLXPLAT_CPLD_LPC_MUX_DEVS]; -}; - -/* Regions for LPC I2C controller and LPC base register space */ -static const struct resource mlxplat_lpc_resources[] = { - [0] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_I2C_BASE_ADRR, - MLXPLAT_CPLD_LPC_IO_RANGE, - "mlxplat_cpld_lpc_i2c_ctrl", IORESOURCE_IO), - [1] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_REG_BASE_ADRR, - MLXPLAT_CPLD_LPC_IO_RANGE, - "mlxplat_cpld_lpc_regs", - IORESOURCE_IO), -}; - -/* Platform default channels */ -static const int mlxplat_default_channels[][8] = { - { - MLXPLAT_CPLD_CH1, MLXPLAT_CPLD_CH1 + 1, MLXPLAT_CPLD_CH1 + 2, - MLXPLAT_CPLD_CH1 + 3, MLXPLAT_CPLD_CH1 + 4, MLXPLAT_CPLD_CH1 + - 5, MLXPLAT_CPLD_CH1 + 6, MLXPLAT_CPLD_CH1 + 7 - }, - { - MLXPLAT_CPLD_CH2, MLXPLAT_CPLD_CH2 + 1, MLXPLAT_CPLD_CH2 + 2, - MLXPLAT_CPLD_CH2 + 3, MLXPLAT_CPLD_CH2 + 4, MLXPLAT_CPLD_CH2 + - 5, MLXPLAT_CPLD_CH2 + 6, MLXPLAT_CPLD_CH2 + 7 - }, -}; - -/* Platform channels for MSN21xx system family */ -static const int mlxplat_msn21xx_channels[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; - -/* Platform mux data */ -static struct i2c_mux_reg_platform_data mlxplat_mux_data[] = { - { - .parent = 1, - .base_nr = MLXPLAT_CPLD_CH1, - .write_only = 1, - .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG1, - .reg_size = 1, - .idle_in_use = 1, - }, - { - .parent = 1, - .base_nr = MLXPLAT_CPLD_CH2, - .write_only = 1, - .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG2, - .reg_size = 1, - .idle_in_use = 1, - }, - -}; - -static struct platform_device *mlxplat_dev; - -static int __init mlxplat_dmi_default_matched(const struct dmi_system_id *dmi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - mlxplat_mux_data[i].values = mlxplat_default_channels[i]; - mlxplat_mux_data[i].n_values = - ARRAY_SIZE(mlxplat_default_channels[i]); - } - - return 1; -}; - -static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - mlxplat_mux_data[i].values = mlxplat_msn21xx_channels; - mlxplat_mux_data[i].n_values = - ARRAY_SIZE(mlxplat_msn21xx_channels); - } - - return 1; -}; - -static struct dmi_system_id mlxplat_dmi_table[] __initdata = { - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN24"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN27"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSB"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSX"), - }, - }, - { - .callback = mlxplat_dmi_msn21xx_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"), - }, - }, - { } -}; - -static int __init mlxplat_init(void) -{ - struct mlxplat_priv *priv; - int i, err; - - if (!dmi_check_system(mlxplat_dmi_table)) - return -ENODEV; - - mlxplat_dev = platform_device_register_simple(MLX_PLAT_DEVICE_NAME, -1, - mlxplat_lpc_resources, - ARRAY_SIZE(mlxplat_lpc_resources)); - - if (IS_ERR(mlxplat_dev)) - return PTR_ERR(mlxplat_dev); - - priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv), - GFP_KERNEL); - if (!priv) { - err = -ENOMEM; - goto fail_alloc; - } - platform_set_drvdata(mlxplat_dev, priv); - - priv->pdev_i2c = platform_device_register_simple("i2c_mlxcpld", -1, - NULL, 0); - if (IS_ERR(priv->pdev_i2c)) { - err = PTR_ERR(priv->pdev_i2c); - goto fail_alloc; - }; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - priv->pdev_mux[i] = platform_device_register_resndata( - &mlxplat_dev->dev, - "i2c-mux-reg", i, NULL, - 0, &mlxplat_mux_data[i], - sizeof(mlxplat_mux_data[i])); - if (IS_ERR(priv->pdev_mux[i])) { - err = PTR_ERR(priv->pdev_mux[i]); - goto fail_platform_mux_register; - } - } - - return 0; - -fail_platform_mux_register: - for (i--; i > 0 ; i--) - platform_device_unregister(priv->pdev_mux[i]); - platform_device_unregister(priv->pdev_i2c); -fail_alloc: - platform_device_unregister(mlxplat_dev); - - return err; -} -module_init(mlxplat_init); - -static void __exit mlxplat_exit(void) -{ - struct mlxplat_priv *priv = platform_get_drvdata(mlxplat_dev); - int i; - - for (i = ARRAY_SIZE(mlxplat_mux_data) - 1; i >= 0 ; i--) - platform_device_unregister(priv->pdev_mux[i]); - - platform_device_unregister(priv->pdev_i2c); - platform_device_unregister(mlxplat_dev); -} -module_exit(mlxplat_exit); - -MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)"); -MODULE_DESCRIPTION("Mellanox platform driver"); -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN24*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN27*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSB*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSX*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN21*:"); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 9e42842e924a..766d4d3529a1 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -19,7 +19,6 @@ #include <asm/uv/uv_hub.h> #include <asm/uv/uv_bau.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/tsc.h> #include <asm/irq_vectors.h> #include <asm/timer.h> diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index cd5173a2733f..c34bd8233f7c 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -27,6 +27,7 @@ #include <linux/moduleparam.h> #include <linux/nmi.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/clocksource.h> @@ -45,8 +46,8 @@ * * Handle system-wide NMI events generated by the global 'power nmi' command. * - * Basic operation is to field the NMI interrupt on each cpu and wait - * until all cpus have arrived into the nmi handler. If some cpus do not + * Basic operation is to field the NMI interrupt on each CPU and wait + * until all CPU's have arrived into the nmi handler. If some CPU's do not * make it into the handler, try and force them in with the IPI(NMI) signal. * * We also have to lessen UV Hub MMR accesses as much as possible as this @@ -56,7 +57,7 @@ * To do this we register our primary NMI notifier on the NMI_UNKNOWN * chain. This reduces the number of false NMI calls when the perf * tools are running which generate an enormous number of NMIs per - * second (~4M/s for 1024 cpu threads). Our secondary NMI handler is + * second (~4M/s for 1024 CPU threads). Our secondary NMI handler is * very short as it only checks that if it has been "pinged" with the * IPI(NMI) signal as mentioned above, and does not read the UV Hub's MMR. * @@ -65,8 +66,20 @@ static struct uv_hub_nmi_s **uv_hub_nmi_list; DEFINE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); -EXPORT_PER_CPU_SYMBOL_GPL(uv_cpu_nmi); +/* UV hubless values */ +#define NMI_CONTROL_PORT 0x70 +#define NMI_DUMMY_PORT 0x71 +#define PAD_OWN_GPP_D_0 0x2c +#define GPI_NMI_STS_GPP_D_0 0x164 +#define GPI_NMI_ENA_GPP_D_0 0x174 +#define STS_GPP_D_0_MASK 0x1 +#define PAD_CFG_DW0_GPP_D_0 0x4c0 +#define GPIROUTNMI (1ul << 17) +#define PCH_PCR_GPIO_1_BASE 0xfdae0000ul +#define PCH_PCR_GPIO_ADDRESS(offset) (int *)((u64)(pch_base) | (u64)(offset)) + +static u64 *pch_base; static unsigned long nmi_mmr; static unsigned long nmi_mmr_clear; static unsigned long nmi_mmr_pending; @@ -100,7 +113,7 @@ static int param_get_local64(char *buffer, const struct kernel_param *kp) static int param_set_local64(const char *val, const struct kernel_param *kp) { - /* clear on any write */ + /* Clear on any write */ local64_set((local64_t *)kp->arg, 0); return 0; } @@ -144,16 +157,80 @@ module_param_named(wait_count, uv_nmi_wait_count, int, 0644); static int uv_nmi_retry_count = 500; module_param_named(retry_count, uv_nmi_retry_count, int, 0644); -/* - * Valid NMI Actions: - * "dump" - dump process stack for each cpu - * "ips" - dump IP info for each cpu - * "kdump" - do crash dump - * "kdb" - enter KDB (default) - * "kgdb" - enter KGDB - */ -static char uv_nmi_action[8] = "kdb"; -module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644); +static bool uv_pch_intr_enable = true; +static bool uv_pch_intr_now_enabled; +module_param_named(pch_intr_enable, uv_pch_intr_enable, bool, 0644); + +static bool uv_pch_init_enable = true; +module_param_named(pch_init_enable, uv_pch_init_enable, bool, 0644); + +static int uv_nmi_debug; +module_param_named(debug, uv_nmi_debug, int, 0644); + +#define nmi_debug(fmt, ...) \ + do { \ + if (uv_nmi_debug) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +/* Valid NMI Actions */ +#define ACTION_LEN 16 +static struct nmi_action { + char *action; + char *desc; +} valid_acts[] = { + { "kdump", "do kernel crash dump" }, + { "dump", "dump process stack for each cpu" }, + { "ips", "dump Inst Ptr info for each cpu" }, + { "kdb", "enter KDB (needs kgdboc= assignment)" }, + { "kgdb", "enter KGDB (needs gdb target remote)" }, + { "health", "check if CPUs respond to NMI" }, +}; +typedef char action_t[ACTION_LEN]; +static action_t uv_nmi_action = { "dump" }; + +static int param_get_action(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%s\n", uv_nmi_action); +} + +static int param_set_action(const char *val, const struct kernel_param *kp) +{ + int i; + int n = ARRAY_SIZE(valid_acts); + char arg[ACTION_LEN], *p; + + /* (remove possible '\n') */ + strncpy(arg, val, ACTION_LEN - 1); + arg[ACTION_LEN - 1] = '\0'; + p = strchr(arg, '\n'); + if (p) + *p = '\0'; + + for (i = 0; i < n; i++) + if (!strcmp(arg, valid_acts[i].action)) + break; + + if (i < n) { + strcpy(uv_nmi_action, arg); + pr_info("UV: New NMI action:%s\n", uv_nmi_action); + return 0; + } + + pr_err("UV: Invalid NMI action:%s, valid actions are:\n", arg); + for (i = 0; i < n; i++) + pr_err("UV: %-8s - %s\n", + valid_acts[i].action, valid_acts[i].desc); + return -EINVAL; +} + +static const struct kernel_param_ops param_ops_action = { + .get = param_get_action, + .set = param_set_action, +}; +#define param_check_action(name, p) __param_check(name, p, action_t) + +module_param_named(action, uv_nmi_action, action, 0644); static inline bool uv_nmi_action_is(const char *action) { @@ -192,8 +269,200 @@ static inline void uv_local_mmr_clear_nmi(void) } /* - * If first cpu in on this hub, set hub_nmi "in_nmi" and "owner" values and - * return true. If first cpu in on the system, set global "in_nmi" flag. + * UV hubless NMI handler functions + */ +static inline void uv_reassert_nmi(void) +{ + /* (from arch/x86/include/asm/mach_traps.h) */ + outb(0x8f, NMI_CONTROL_PORT); + inb(NMI_DUMMY_PORT); /* dummy read */ + outb(0x0f, NMI_CONTROL_PORT); + inb(NMI_DUMMY_PORT); /* dummy read */ +} + +static void uv_init_hubless_pch_io(int offset, int mask, int data) +{ + int *addr = PCH_PCR_GPIO_ADDRESS(offset); + int readd = readl(addr); + + if (mask) { /* OR in new data */ + int writed = (readd & ~mask) | data; + + nmi_debug("UV:PCH: %p = %x & %x | %x (%x)\n", + addr, readd, ~mask, data, writed); + writel(writed, addr); + } else if (readd & data) { /* clear status bit */ + nmi_debug("UV:PCH: %p = %x\n", addr, data); + writel(data, addr); + } + + (void)readl(addr); /* flush write data */ +} + +static void uv_nmi_setup_hubless_intr(void) +{ + uv_pch_intr_now_enabled = uv_pch_intr_enable; + + uv_init_hubless_pch_io( + PAD_CFG_DW0_GPP_D_0, GPIROUTNMI, + uv_pch_intr_now_enabled ? GPIROUTNMI : 0); + + nmi_debug("UV:NMI: GPP_D_0 interrupt %s\n", + uv_pch_intr_now_enabled ? "enabled" : "disabled"); +} + +static struct init_nmi { + unsigned int offset; + unsigned int mask; + unsigned int data; +} init_nmi[] = { + { /* HOSTSW_OWN_GPP_D_0 */ + .offset = 0x84, + .mask = 0x1, + .data = 0x0, /* ACPI Mode */ + }, + +/* Clear status: */ + { /* GPI_INT_STS_GPP_D_0 */ + .offset = 0x104, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + { /* GPI_GPE_STS_GPP_D_0 */ + .offset = 0x124, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + { /* GPI_SMI_STS_GPP_D_0 */ + .offset = 0x144, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + { /* GPI_NMI_STS_GPP_D_0 */ + .offset = 0x164, + .mask = 0x0, + .data = 0x1, /* Clear Status */ + }, + +/* Disable interrupts: */ + { /* GPI_INT_EN_GPP_D_0 */ + .offset = 0x114, + .mask = 0x1, + .data = 0x0, /* Disable interrupt generation */ + }, + { /* GPI_GPE_EN_GPP_D_0 */ + .offset = 0x134, + .mask = 0x1, + .data = 0x0, /* Disable interrupt generation */ + }, + { /* GPI_SMI_EN_GPP_D_0 */ + .offset = 0x154, + .mask = 0x1, + .data = 0x0, /* Disable interrupt generation */ + }, + { /* GPI_NMI_EN_GPP_D_0 */ + .offset = 0x174, + .mask = 0x1, + .data = 0x0, /* Disable interrupt generation */ + }, + +/* Setup GPP_D_0 Pad Config: */ + { /* PAD_CFG_DW0_GPP_D_0 */ + .offset = 0x4c0, + .mask = 0xffffffff, + .data = 0x82020100, +/* + * 31:30 Pad Reset Config (PADRSTCFG): = 2h # PLTRST# (default) + * + * 29 RX Pad State Select (RXPADSTSEL): = 0 # Raw RX pad state directly + * from RX buffer (default) + * + * 28 RX Raw Override to '1' (RXRAW1): = 0 # No Override + * + * 26:25 RX Level/Edge Configuration (RXEVCFG): + * = 0h # Level + * = 1h # Edge + * + * 23 RX Invert (RXINV): = 0 # No Inversion (signal active high) + * + * 20 GPIO Input Route IOxAPIC (GPIROUTIOXAPIC): + * = 0 # Routing does not cause peripheral IRQ... + * # (we want an NMI not an IRQ) + * + * 19 GPIO Input Route SCI (GPIROUTSCI): = 0 # Routing does not cause SCI. + * 18 GPIO Input Route SMI (GPIROUTSMI): = 0 # Routing does not cause SMI. + * 17 GPIO Input Route NMI (GPIROUTNMI): = 1 # Routing can cause NMI. + * + * 11:10 Pad Mode (PMODE1/0): = 0h = GPIO control the Pad. + * 9 GPIO RX Disable (GPIORXDIS): + * = 0 # Enable the input buffer (active low enable) + * + * 8 GPIO TX Disable (GPIOTXDIS): + * = 1 # Disable the output buffer; i.e. Hi-Z + * + * 1 GPIO RX State (GPIORXSTATE): This is the current internal RX pad state.. + * 0 GPIO TX State (GPIOTXSTATE): + * = 0 # (Leave at default) + */ + }, + +/* Pad Config DW1 */ + { /* PAD_CFG_DW1_GPP_D_0 */ + .offset = 0x4c4, + .mask = 0x3c00, + .data = 0, /* Termination = none (default) */ + }, +}; + +static void uv_init_hubless_pch_d0(void) +{ + int i, read; + + read = *PCH_PCR_GPIO_ADDRESS(PAD_OWN_GPP_D_0); + if (read != 0) { + pr_info("UV: Hubless NMI already configured\n"); + return; + } + + nmi_debug("UV: Initializing UV Hubless NMI on PCH\n"); + for (i = 0; i < ARRAY_SIZE(init_nmi); i++) { + uv_init_hubless_pch_io(init_nmi[i].offset, + init_nmi[i].mask, + init_nmi[i].data); + } +} + +static int uv_nmi_test_hubless(struct uv_hub_nmi_s *hub_nmi) +{ + int *pstat = PCH_PCR_GPIO_ADDRESS(GPI_NMI_STS_GPP_D_0); + int status = *pstat; + + hub_nmi->nmi_value = status; + atomic_inc(&hub_nmi->read_mmr_count); + + if (!(status & STS_GPP_D_0_MASK)) /* Not a UV external NMI */ + return 0; + + *pstat = STS_GPP_D_0_MASK; /* Is a UV NMI: clear GPP_D_0 status */ + (void)*pstat; /* Flush write */ + + return 1; +} + +static int uv_test_nmi(struct uv_hub_nmi_s *hub_nmi) +{ + if (hub_nmi->hub_present) + return uv_nmi_test_mmr(hub_nmi); + + if (hub_nmi->pch_owner) /* Only PCH owner can check status */ + return uv_nmi_test_hubless(hub_nmi); + + return -1; +} + +/* + * If first CPU in on this hub, set hub_nmi "in_nmi" and "owner" values and + * return true. If first CPU in on the system, set global "in_nmi" flag. */ static int uv_set_in_nmi(int cpu, struct uv_hub_nmi_s *hub_nmi) { @@ -214,6 +483,7 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) { int cpu = smp_processor_id(); int nmi = 0; + int nmi_detected = 0; local64_inc(&uv_nmi_count); this_cpu_inc(uv_cpu_nmi.queries); @@ -224,35 +494,48 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) break; if (raw_spin_trylock(&hub_nmi->nmi_lock)) { + nmi_detected = uv_test_nmi(hub_nmi); - /* check hub MMR NMI flag */ - if (uv_nmi_test_mmr(hub_nmi)) { + /* Check flag for UV external NMI */ + if (nmi_detected > 0) { uv_set_in_nmi(cpu, hub_nmi); nmi = 1; break; } - /* MMR NMI flag is clear */ + /* A non-PCH node in a hubless system waits for NMI */ + else if (nmi_detected < 0) + goto slave_wait; + + /* MMR/PCH NMI flag is clear */ raw_spin_unlock(&hub_nmi->nmi_lock); } else { - /* wait a moment for the hub nmi locker to set flag */ - cpu_relax(); + + /* Wait a moment for the HUB NMI locker to set flag */ +slave_wait: cpu_relax(); udelay(uv_nmi_slave_delay); - /* re-check hub in_nmi flag */ + /* Re-check hub in_nmi flag */ nmi = atomic_read(&hub_nmi->in_nmi); if (nmi) break; } - /* check if this BMC missed setting the MMR NMI flag */ + /* + * Check if this BMC missed setting the MMR NMI flag (or) + * UV hubless system where only PCH owner can check flag + */ if (!nmi) { nmi = atomic_read(&uv_in_nmi); if (nmi) uv_set_in_nmi(cpu, hub_nmi); } + /* If we're holding the hub lock, release it now */ + if (nmi_detected < 0) + raw_spin_unlock(&hub_nmi->nmi_lock); + } while (0); if (!nmi) @@ -269,12 +552,15 @@ static inline void uv_clear_nmi(int cpu) if (cpu == atomic_read(&hub_nmi->cpu_owner)) { atomic_set(&hub_nmi->cpu_owner, -1); atomic_set(&hub_nmi->in_nmi, 0); - uv_local_mmr_clear_nmi(); + if (hub_nmi->hub_present) + uv_local_mmr_clear_nmi(); + else + uv_reassert_nmi(); raw_spin_unlock(&hub_nmi->nmi_lock); } } -/* Ping non-responding cpus attemping to force them into the NMI handler */ +/* Ping non-responding CPU's attemping to force them into the NMI handler */ static void uv_nmi_nr_cpus_ping(void) { int cpu; @@ -285,7 +571,7 @@ static void uv_nmi_nr_cpus_ping(void) apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI); } -/* Clean up flags for cpus that ignored both NMI and ping */ +/* Clean up flags for CPU's that ignored both NMI and ping */ static void uv_nmi_cleanup_mask(void) { int cpu; @@ -297,11 +583,12 @@ static void uv_nmi_cleanup_mask(void) } } -/* Loop waiting as cpus enter nmi handler */ +/* Loop waiting as CPU's enter NMI handler */ static int uv_nmi_wait_cpus(int first) { int i, j, k, n = num_online_cpus(); int last_k = 0, waiting = 0; + int cpu = smp_processor_id(); if (first) { cpumask_copy(uv_nmi_cpu_mask, cpu_online_mask); @@ -310,6 +597,12 @@ static int uv_nmi_wait_cpus(int first) k = n - cpumask_weight(uv_nmi_cpu_mask); } + /* PCH NMI causes only one CPU to respond */ + if (first && uv_pch_intr_now_enabled) { + cpumask_clear_cpu(cpu, uv_nmi_cpu_mask); + return n - k - 1; + } + udelay(uv_nmi_initial_delay); for (i = 0; i < uv_nmi_retry_count; i++) { int loop_delay = uv_nmi_loop_delay; @@ -325,13 +618,13 @@ static int uv_nmi_wait_cpus(int first) k = n; break; } - if (last_k != k) { /* abort if no new cpus coming in */ + if (last_k != k) { /* abort if no new CPU's coming in */ last_k = k; waiting = 0; } else if (++waiting > uv_nmi_wait_count) break; - /* extend delay if waiting only for cpu 0 */ + /* Extend delay if waiting only for CPU 0: */ if (waiting && (n - k) == 1 && cpumask_test_cpu(0, uv_nmi_cpu_mask)) loop_delay *= 100; @@ -342,29 +635,29 @@ static int uv_nmi_wait_cpus(int first) return n - k; } -/* Wait until all slave cpus have entered UV NMI handler */ +/* Wait until all slave CPU's have entered UV NMI handler */ static void uv_nmi_wait(int master) { - /* indicate this cpu is in */ + /* Indicate this CPU is in: */ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_IN); - /* if not the first cpu in (the master), then we are a slave cpu */ + /* If not the first CPU in (the master), then we are a slave CPU */ if (!master) return; do { - /* wait for all other cpus to gather here */ + /* Wait for all other CPU's to gather here */ if (!uv_nmi_wait_cpus(1)) break; - /* if not all made it in, send IPI NMI to them */ - pr_alert("UV: Sending NMI IPI to %d non-responding CPUs: %*pbl\n", + /* If not all made it in, send IPI NMI to them */ + pr_alert("UV: Sending NMI IPI to %d CPUs: %*pbl\n", cpumask_weight(uv_nmi_cpu_mask), cpumask_pr_args(uv_nmi_cpu_mask)); uv_nmi_nr_cpus_ping(); - /* if all cpus are in, then done */ + /* If all CPU's are in, then done */ if (!uv_nmi_wait_cpus(0)) break; @@ -387,8 +680,8 @@ static void uv_nmi_dump_cpu_ip_hdr(void) /* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); - printk_address(regs->ip); + pr_info("UV: %4d %6d %-32.32s %pS", + cpu, current->pid, current->comm, (void *)regs->ip); } /* @@ -416,7 +709,7 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } -/* Trigger a slave cpu to dump it's state */ +/* Trigger a slave CPU to dump it's state */ static void uv_nmi_trigger_dump(int cpu) { int retry = uv_nmi_trigger_delay; @@ -437,7 +730,7 @@ static void uv_nmi_trigger_dump(int cpu) uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP_DONE; } -/* Wait until all cpus ready to exit */ +/* Wait until all CPU's ready to exit */ static void uv_nmi_sync_exit(int master) { atomic_dec(&uv_nmi_cpus_in_nmi); @@ -451,7 +744,23 @@ static void uv_nmi_sync_exit(int master) } } -/* Walk through cpu list and dump state of each */ +/* Current "health" check is to check which CPU's are responsive */ +static void uv_nmi_action_health(int cpu, struct pt_regs *regs, int master) +{ + if (master) { + int in = atomic_read(&uv_nmi_cpus_in_nmi); + int out = num_online_cpus() - in; + + pr_alert("UV: NMI CPU health check (non-responding:%d)\n", out); + atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); + } else { + while (!atomic_read(&uv_nmi_slave_continue)) + cpu_relax(); + } + uv_nmi_sync_exit(master); +} + +/* Walk through CPU list and dump state of each */ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) { if (master) { @@ -538,7 +847,7 @@ static inline int uv_nmi_kdb_reason(void) #else /* !CONFIG_KGDB_KDB */ static inline int uv_nmi_kdb_reason(void) { - /* Insure user is expecting to attach gdb remote */ + /* Ensure user is expecting to attach gdb remote */ if (uv_nmi_action_is("kgdb")) return 0; @@ -563,7 +872,7 @@ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) if (reason < 0) return; - /* call KGDB NMI handler as MASTER */ + /* Call KGDB NMI handler as MASTER */ ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs, reason, &uv_nmi_slave_continue); if (ret) { @@ -571,7 +880,7 @@ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); } } else { - /* wait for KGDB signal that it's ready for slaves to enter */ + /* Wait for KGDB signal that it's ready for slaves to enter */ int sig; do { @@ -579,7 +888,7 @@ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) sig = atomic_read(&uv_nmi_slave_continue); } while (!sig); - /* call KGDB as slave */ + /* Call KGDB as slave */ if (sig == SLAVE_CONTINUE) kgdb_nmicallback(cpu, regs); } @@ -623,18 +932,23 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action)); } - /* Pause as all cpus enter the NMI handler */ + /* Pause as all CPU's enter the NMI handler */ uv_nmi_wait(master); - /* Dump state of each cpu */ - if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) + /* Process actions other than "kdump": */ + if (uv_nmi_action_is("health")) { + uv_nmi_action_health(cpu, regs, master); + } else if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) { uv_nmi_dump_state(cpu, regs, master); - - /* Call KGDB/KDB if enabled */ - else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) + } else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) { uv_call_kgdb_kdb(cpu, regs, master); + } else { + if (master) + pr_alert("UV: unknown NMI action: %s\n", uv_nmi_action); + uv_nmi_sync_exit(master); + } - /* Clear per_cpu "in nmi" flag */ + /* Clear per_cpu "in_nmi" flag */ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_OUT); /* Clear MMR NMI flag on each hub */ @@ -648,6 +962,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) atomic_set(&uv_nmi_cpu, -1); atomic_set(&uv_in_nmi, 0); atomic_set(&uv_nmi_kexec_failed, 0); + atomic_set(&uv_nmi_slave_continue, SLAVE_CLEAR); } uv_nmi_touch_watchdogs(); @@ -657,7 +972,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) } /* - * NMI handler for pulling in CPUs when perf events are grabbing our NMI + * NMI handler for pulling in CPU's when perf events are grabbing our NMI */ static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs) { @@ -690,35 +1005,62 @@ void uv_nmi_init(void) unsigned int value; /* - * Unmask NMI on all cpus + * Unmask NMI on all CPU's */ value = apic_read(APIC_LVT1) | APIC_DM_NMI; value &= ~APIC_LVT_MASKED; apic_write(APIC_LVT1, value); } -void uv_nmi_setup(void) +/* Setup HUB NMI info */ +void __init uv_nmi_setup_common(bool hubbed) { int size = sizeof(void *) * (1 << NODES_SHIFT); - int cpu, nid; + int cpu; - /* Setup hub nmi info */ - uv_nmi_setup_mmrs(); uv_hub_nmi_list = kzalloc(size, GFP_KERNEL); - pr_info("UV: NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size); + nmi_debug("UV: NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size); BUG_ON(!uv_hub_nmi_list); size = sizeof(struct uv_hub_nmi_s); for_each_present_cpu(cpu) { - nid = cpu_to_node(cpu); + int nid = cpu_to_node(cpu); if (uv_hub_nmi_list[nid] == NULL) { uv_hub_nmi_list[nid] = kzalloc_node(size, GFP_KERNEL, nid); BUG_ON(!uv_hub_nmi_list[nid]); raw_spin_lock_init(&(uv_hub_nmi_list[nid]->nmi_lock)); atomic_set(&uv_hub_nmi_list[nid]->cpu_owner, -1); + uv_hub_nmi_list[nid]->hub_present = hubbed; + uv_hub_nmi_list[nid]->pch_owner = (nid == 0); } uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid]; } BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL)); +} + +/* Setup for UV Hub systems */ +void __init uv_nmi_setup(void) +{ + uv_nmi_setup_mmrs(); + uv_nmi_setup_common(true); + uv_register_nmi_notifier(); + pr_info("UV: Hub NMI enabled\n"); +} + +/* Setup for UV Hubless systems */ +void __init uv_nmi_setup_hubless(void) +{ + uv_nmi_setup_common(false); + pch_base = xlate_dev_mem_ptr(PCH_PCR_GPIO_1_BASE); + nmi_debug("UV: PCH base:%p from 0x%lx, GPP_D_0\n", + pch_base, PCH_PCR_GPIO_1_BASE); + if (uv_pch_init_enable) + uv_init_hubless_pch_d0(); + uv_init_hubless_pch_io(GPI_NMI_ENA_GPP_D_0, + STS_GPP_D_0_MASK, STS_GPP_D_0_MASK); + uv_nmi_setup_hubless_intr(); + /* Ensure NMI enabled in Processor Interface Reg: */ + uv_reassert_nmi(); uv_register_nmi_notifier(); + pr_info("UV: Hubless NMI enabled\n"); } diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index b333fc45f9ec..2ee7632d4916 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -30,7 +30,7 @@ #define RTC_NAME "sgi_rtc" -static cycle_t uv_read_rtc(struct clocksource *cs); +static u64 uv_read_rtc(struct clocksource *cs); static int uv_rtc_next_event(unsigned long, struct clock_event_device *); static int uv_rtc_shutdown(struct clock_event_device *evt); @@ -38,7 +38,7 @@ static struct clocksource clocksource_uv = { .name = RTC_NAME, .rating = 299, .read = uv_read_rtc, - .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, + .mask = (u64)UVH_RTC_REAL_TIME_CLOCK_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -296,7 +296,7 @@ static int uv_rtc_unset_timer(int cpu, int force) * cachelines of it's own page. This allows faster simultaneous reads * from a given socket. */ -static cycle_t uv_read_rtc(struct clocksource *cs) +static u64 uv_read_rtc(struct clocksource *cs) { unsigned long offset; @@ -305,7 +305,7 @@ static cycle_t uv_read_rtc(struct clocksource *cs) else offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); + return (u64)uv_read_local_mmr(UVH_RTC | offset); } /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 53cace2ec0e2..66ade16c7693 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -252,6 +252,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); + tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 9634557a5444..ded2e8272382 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,10 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> +#include <linux/scatterlist.h> +#include <linux/kdebug.h> + +#include <crypto/hash.h> #include <asm/init.h> #include <asm/proto.h> @@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn) return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } +#define MD5_DIGEST_SIZE 16 + struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; }; -#define RESTORE_MAGIC 0x123456789ABCDEF0UL +#define RESTORE_MAGIC 0x23456789ABCDEF01UL + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 map + * + * @map: the e820 map to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820map *map, void *buf) +{ + struct scatterlist sg; + struct crypto_ahash *tfm; + int size; + int ret = 0; + + tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return -ENOMEM; + + { + AHASH_REQUEST_ON_STACK(req, tfm); + size = offsetof(struct e820map, map) + + sizeof(struct e820entry) * map->nr_map; + ahash_request_set_tfm(req, tfm); + sg_init_one(&sg, (u8 *)map, size); + ahash_request_set_callback(req, 0, NULL, NULL); + ahash_request_set_crypt(req, &sg, buf, size); + + if (crypto_ahash_digest(req)) + ret = -EINVAL; + ahash_request_zero(req); + } + crypto_free_ahash(tfm); + + return ret; +} + +static void hibernation_e820_save(void *buf) +{ + get_e820_md5(e820_saved, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_saved, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static void hibernation_e820_save(void *buf) +{ +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif /** * arch_hibernation_header_save - populate the architecture specific part @@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; + + hibernation_e820_save(rdr->e820_digest); + return 0; } @@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr) restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; } diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index d957d5f21a86..0bc60a308730 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,6 +1,6 @@ config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" - depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB + depends on RAS && X86_MCE && DEBUG_FS && AMD_NB default n help This is a simple debugfs interface to inject MCEs and test different diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 1ac76479c266..8730c2882fff 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -275,6 +275,8 @@ static void do_inject(void) unsigned int cpu = i_mce.extcpu; u8 b = i_mce.bank; + rdtscll(i_mce.tsc); + if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 25012abc3409..4463fa72db94 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -69,7 +69,7 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE # --------------------------------------------------------------------------- -KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ +KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index ba70ff232917..1972565ab106 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -269,7 +269,8 @@ int main(int argc, char **argv) insns++; } - fprintf(stdout, "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", + fprintf((errors) ? stderr : stdout, + "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", prog, (errors) ? "Failure" : "Success", insns, diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 0c2fae8d929d..73eb7fd4aec4 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -992,11 +992,12 @@ static void emit_relocs(int as_text, int use_real_mode) die("Segment relocations found but --realmode not specified\n"); /* Order the relocations for more efficient processing */ - sort_relocs(&relocs16); sort_relocs(&relocs32); #if ELF_BITS == 64 sort_relocs(&relocs32neg); sort_relocs(&relocs64); +#else + sort_relocs(&relocs16); #endif /* Print the relocations */ diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 56f04db0c9c0..ecf31e0358c8 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -167,7 +167,7 @@ int main(int argc, char **argv) fprintf(stderr, "Warning: decoded and checked %d" " instructions with %d warnings\n", insns, warnings); else - fprintf(stderr, "Succeed: decoded and checked %d" + fprintf(stdout, "Success: decoded and checked %d" " instructions\n", insns); return 0; } diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 233ee09c1ce8..c77db2288982 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -26,7 +26,6 @@ static inline void rep_nop(void) } #define cpu_relax() rep_nop() -#define cpu_relax_lowlatency() cpu_relax() #define task_pt_regs(t) (&(t)->thread.regs) diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 60a5a5a85505..2497bac56066 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -5,7 +5,7 @@ #include <linux/mm.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ptrace-abi.h> #include <skas.h> diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index e30202b1716e..a5c9910d234f 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -10,7 +10,7 @@ #include <linux/errno.h> #define __FRAME_OFFSETS #include <asm/ptrace.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ptrace-abi.h> /* diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 49e503697022..727ed442e0a5 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -9,7 +9,7 @@ #include <linux/ptrace.h> #include <linux/kernel.h> #include <asm/unistd.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ucontext.h> #include <frame_kern.h> #include <skas.h> diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index e6552275320b..10d907098c26 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -6,6 +6,7 @@ */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c index 16ee0e450e3e..f2383484840d 100644 --- a/arch/x86/um/sysrq_32.c +++ b/arch/x86/um/sysrq_32.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/smp.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <asm/ptrace.h> #include <asm/sysrq.h> diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index 38b4e4abd0f8..903ad91b624f 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/utsname.h> #include <asm/current.h> #include <asm/ptrace.h> diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 48e38584d5c1..5bd949da7a4a 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -6,7 +6,7 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ptrace-abi.h> #include <os.h> #include <skas.h> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c7b15f3e2cf3..76b6dbd627df 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -53,5 +53,5 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" - depends on X86_64 && XEN && XEN_PVHVM + depends on XEN && XEN_PVHVM && ACPI def_bool n diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index e47e52787d32..cb0164aee156 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_PVH) += xen-pvh.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 44c88ad1841a..bcea81f36fc5 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -145,7 +145,7 @@ static void xen_silent_inquire(int apicid) static int xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) - return xen_get_apic_id(xen_apic_read(APIC_ID)); + return cpu_data(cpu).apicid; else return BAD_APICID; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bdd855685403..ec1d5c46e58f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -45,6 +45,7 @@ #include <xen/interface/memory.h> #include <xen/interface/nmi.h> #include <xen/interface/xen-mca.h> +#include <xen/interface/hvm/start_info.h> #include <xen/features.h> #include <xen/page.h> #include <xen/hvm.h> @@ -176,6 +177,20 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +#ifdef CONFIG_XEN_PVH +/* + * PVH variables. + * + * xen_pvh and pvh_bootparams need to live in data segment since they + * are used after startup_{32|64}, which clear .bss, are invoked. + */ +bool xen_pvh __attribute__((section(".data"))) = 0; +struct boot_params pvh_bootparams __attribute__((section(".data"))); + +struct hvm_start_info pvh_start_info; +unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +#endif + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -980,17 +995,6 @@ static void xen_io_delay(void) { } -static void xen_clts(void) -{ - struct multicall_space mcs; - - mcs = xen_mc_entry(0); - - MULTI_fpu_taskswitch(mcs.mc, 0); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - static DEFINE_PER_CPU(unsigned long, xen_cr0_value); static unsigned long xen_read_cr0(void) @@ -1149,10 +1153,11 @@ void xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); } - /* xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. Note that for - * PVH we want to use native IRQ mechanism. */ - if (have_vcpu_info_placement && !xen_pvh_domain()) { + /* + * xen_vcpu_setup managed to place the vcpu_info within the + * percpu area for all cpus, so make use of it. + */ + if (have_vcpu_info_placement) { pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -1233,8 +1238,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .set_debugreg = xen_set_debugreg, .get_debugreg = xen_get_debugreg, - .clts = xen_clts, - .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, @@ -1426,49 +1429,9 @@ static void __init xen_boot_params_init_edd(void) * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. - * - * Note, that it is __ref because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. */ -static void __ref xen_setup_gdt(int cpu) +static void xen_setup_gdt(int cpu) { - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_X86_64 - unsigned long dummy; - - load_percpu_segment(cpu); /* We need to access per-cpu area */ - switch_to_new_gdt(cpu); /* GDT and GS set */ - - /* We are switching of the Xen provided GDT to our HVM mode - * GDT. The new GDT has __KERNEL_CS with CS.L = 1 - * and we are jumping to reload it. - */ - asm volatile ("pushq %0\n" - "leaq 1f(%%rip),%0\n" - "pushq %0\n" - "lretq\n" - "1:\n" - : "=&r" (dummy) : "0" (__KERNEL_CS)); - - /* - * While not needed, we also set the %es, %ds, and %fs - * to zero. We don't care about %ss as it is NULL. - * Strictly speaking this is not needed as Xen zeros those - * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) - * - * Linux zeros them in cpu_init() and in secondary_startup_64 - * (for BSP). - */ - loadsegment(es, 0); - loadsegment(ds, 0); - loadsegment(fs, 0); -#else - /* PVH: TODO Implement. */ - BUG(); -#endif - return; /* PVH does not need any PV GDT ops. */ - } pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1479,59 +1442,6 @@ static void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } -#ifdef CONFIG_XEN_PVH -/* - * A PV guest starts with default flags that are not set for PVH, set them - * here asap. - */ -static void xen_pvh_set_cr_flags(int cpu) -{ - - /* Some of these are setup in 'secondary_startup_64'. The others: - * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests - * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ - write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); - - if (!cpu) - return; - /* - * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). - */ - if (boot_cpu_has(X86_FEATURE_PSE)) - cr4_set_bits_and_update_boot(X86_CR4_PSE); - - if (boot_cpu_has(X86_FEATURE_PGE)) - cr4_set_bits_and_update_boot(X86_CR4_PGE); -} - -/* - * Note, that it is ref - because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. - */ -void __ref xen_pvh_secondary_vcpu_init(int cpu) -{ - xen_setup_gdt(cpu); - xen_pvh_set_cr_flags(cpu); -} - -static void __init xen_pvh_early_guest_init(void) -{ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - return; - - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); - - xen_pvh_early_cpu_init(0, false); - xen_pvh_set_cr_flags(0); - -#ifdef CONFIG_X86_32 - BUG(); /* PVH: Implement proper support. */ -#endif -} -#endif /* CONFIG_XEN_PVH */ - static void __init xen_dom0_set_legacy_features(void) { x86_platform.legacy.rtc = 1; @@ -1542,11 +1452,11 @@ static int xen_cpuhp_setup(void) int rc; rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "XEN_HVM_GUEST_PREPARE", + "x86/xen/hvm_guest:prepare", xen_cpu_up_prepare, xen_cpu_dead); if (rc >= 0) { rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "XEN_HVM_GUEST_ONLINE", + "x86/xen/hvm_guest:online", xen_cpu_up_online, NULL); if (rc < 0) cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); @@ -1568,24 +1478,17 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; xen_setup_features(); -#ifdef CONFIG_XEN_PVH - xen_pvh_early_guest_init(); -#endif + xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - if (!xen_pvh_domain()) { - pv_cpu_ops = xen_cpu_ops; + pv_cpu_ops = xen_cpu_ops; - x86_platform.get_nmi_reason = xen_get_nmi_reason; - } + x86_platform.get_nmi_reason = xen_get_nmi_reason; - if (xen_feature(XENFEAT_auto_translated_physmap)) - x86_init.resources.memory_setup = xen_auto_xlated_memory_setup; - else - x86_init.resources.memory_setup = xen_memory_setup; + x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; @@ -1678,18 +1581,15 @@ asmlinkage __visible void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); - /* PVH: runs at default kernel iopl of 0 */ - if (!xen_pvh_domain()) { - /* - * We used to do this in xen_arch_setup, but that is too late - * on AMD were early_cpu_init (run before ->arch_setup()) calls - * early_amd_init which pokes 0xcf8 port. - */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); - } + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ @@ -1771,6 +1671,102 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } +#ifdef CONFIG_XEN_PVH + +static void xen_pvh_arch_setup(void) +{ +#ifdef CONFIG_ACPI + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ + if (nr_ioapics == 0) + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; +#endif +} + +static void __init init_pvh_bootparams(void) +{ + struct xen_memory_map memmap; + unsigned int i; + int rc; + + memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); + + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map); + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + + if (memmap.nr_entries < E820MAX - 1) { + pvh_bootparams.e820_map[memmap.nr_entries].addr = + ISA_START_ADDRESS; + pvh_bootparams.e820_map[memmap.nr_entries].size = + ISA_END_ADDRESS - ISA_START_ADDRESS; + pvh_bootparams.e820_map[memmap.nr_entries].type = + E820_RESERVED; + memmap.nr_entries++; + } else + xen_raw_printk("Warning: Can fit ISA range into e820\n"); + + sanitize_e820_map(pvh_bootparams.e820_map, + ARRAY_SIZE(pvh_bootparams.e820_map), + &memmap.nr_entries); + + pvh_bootparams.e820_entries = memmap.nr_entries; + for (i = 0; i < pvh_bootparams.e820_entries; i++) + e820_add_region(pvh_bootparams.e820_map[i].addr, + pvh_bootparams.e820_map[i].size, + pvh_bootparams.e820_map[i].type); + + pvh_bootparams.hdr.cmd_line_ptr = + pvh_start_info.cmdline_paddr; + + /* The first module is always ramdisk. */ + if (pvh_start_info.nr_modules) { + struct hvm_modlist_entry *modaddr = + __va(pvh_start_info.modlist_paddr); + pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; + pvh_bootparams.hdr.ramdisk_size = modaddr->size; + } + + /* + * See Documentation/x86/boot.txt. + * + * Version 2.12 supports Xen entry point but we will use default x86/PC + * environment (i.e. hardware_subarch 0). + */ + pvh_bootparams.hdr.version = 0x212; + pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ +} + +/* + * This routine (and those that it might call) should not use + * anything that lives in .bss since that segment will be cleared later. + */ +void __init xen_prepare_pvh(void) +{ + u32 msr; + u64 pfn; + + if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", + pvh_start_info.magic); + BUG(); + } + + xen_pvh = 1; + + msr = cpuid_ebx(xen_cpuid_base() + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + init_pvh_bootparams(); + + x86_init.oem.arch_setup = xen_pvh_arch_setup; +} +#endif + void __ref xen_hvm_init_shared_info(void) { int cpu; @@ -1810,20 +1806,29 @@ void __ref xen_hvm_init_shared_info(void) static void __init init_hvm_pv_info(void) { int major, minor; - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; + uint32_t eax, ebx, ecx, edx, base; base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); + eax = cpuid_eax(base + 1); major = eax >> 16; minor = eax & 0xffff; printk(KERN_INFO "Xen version %d.%d.\n", major, minor); - cpuid(base + 2, &pages, &msr, &ecx, &edx); + xen_domain_type = XEN_HVM_DOMAIN; - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + /* PVH set up hypercall page in xen_prepare_pvh(). */ + if (xen_pvh_domain()) + pv_info.name = "Xen PVH"; + else { + u64 pfn; + uint32_t msr; + + pv_info.name = "Xen HVM"; + msr = cpuid_ebx(base + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + } xen_setup_features(); @@ -1832,10 +1837,6 @@ static void __init init_hvm_pv_info(void) this_cpu_write(xen_vcpu_id, ebx); else this_cpu_write(xen_vcpu_id, smp_processor_id()); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; } #endif @@ -1923,6 +1924,9 @@ static void __init xen_hvm_guest_init(void) x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); + + if (xen_pvh_domain()) + machine_ops.emergency_restart = xen_emergency_restart; #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; machine_ops.crash_shutdown = xen_hvm_crash_shutdown; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7d5afdb417cc..37cb5aad71de 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -38,7 +38,7 @@ * * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ -#include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/debugfs.h> #include <linux/bug.h> @@ -1792,10 +1792,6 @@ static void __init set_page_prot_flags(void *addr, pgprot_t prot, unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - /* For PVH no need to set R/O or R/W to pin them or unpin them. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } @@ -1902,8 +1898,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * level2_ident_pgt, and level2_kernel_pgt. This means that only the * kernel has a physical mapping to start with - but that's enough to * get __va working. We need to fill in the rest of the physical - * mapping once some sort of allocator has been set up. NOTE: for - * PVH, the page tables are native. + * mapping once some sort of allocator has been set up. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -2812,16 +2807,6 @@ static int do_remap_gfn(struct vm_area_struct *vma, BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_XEN_PVH - /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, - prot, domid, pages); -#else - return -EINVAL; -#endif - } - rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contiguous @@ -2915,10 +2900,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) return 0; -#ifdef CONFIG_XEN_PVH - return xen_xlate_unmap_gfn_range(vma, numpgs, pages); -#else return -EINVAL; -#endif } EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 37129db76d33..276da636dd39 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -71,7 +71,7 @@ #include <asm/cache.h> #include <asm/setup.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/xen/page.h> #include <asm/xen/hypercall.h> diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 0e98e5d241d0..42b08f8fc2ca 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,8 +18,7 @@ int xen_swiotlb __read_mostly; -static struct dma_map_ops xen_swiotlb_dma_ops = { - .mapping_error = xen_swiotlb_dma_mapping_error, +static const struct dma_map_ops xen_swiotlb_dma_ops = { .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, @@ -49,7 +48,7 @@ int __init pci_xen_swiotlb_detect(void) * activate this IOMMU. If running as PV privileged, activate it * irregardless. */ - if ((xen_initial_domain() || swiotlb || swiotlb_force)) + if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE) xen_swiotlb = 1; /* If we are running under Xen, we MUST disable the native SWIOTLB. diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 90d1b83cf35f..33a783c77d96 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -73,8 +73,8 @@ bool xen_has_pv_devices(void) if (!xen_domain()) return false; - /* PV domains always have them. */ - if (xen_pv_domain()) + /* PV and PVH domains always have them. */ + if (xen_pv_domain() || xen_pvh_domain()) return true; /* And user has xen_platform_pci=0 set in guest config as diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f8960fca0827..a8c306cf8868 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -41,7 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; unsigned long xen_released_pages; /* E820 map used during setting up memory. */ -static struct e820entry xen_e820_map[E820MAX] __initdata; +static struct e820entry xen_e820_map[E820_X_MAX] __initdata; static u32 xen_e820_map_entries __initdata; /* @@ -713,10 +713,9 @@ static void __init xen_reserve_xen_mfnlist(void) size = PFN_PHYS(xen_start_info->nr_p2m_frames); } - if (!xen_is_e820_reserved(start, size)) { - memblock_reserve(start, size); + memblock_reserve(start, size); + if (!xen_is_e820_reserved(start, size)) return; - } #ifdef CONFIG_X86_32 /* @@ -727,6 +726,7 @@ static void __init xen_reserve_xen_mfnlist(void) BUG(); #else xen_relocate_p2m(); + memblock_free(start, size); #endif } @@ -750,7 +750,7 @@ char * __init xen_memory_setup(void) max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? @@ -915,39 +915,6 @@ char * __init xen_memory_setup(void) } /* - * Machine specific memory setup for auto-translated guests. - */ -char * __init xen_auto_xlated_memory_setup(void) -{ - struct xen_memory_map memmap; - int i; - int rc; - - memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, xen_e820_map); - - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc < 0) - panic("No memory map (%d)\n", rc); - - xen_e820_map_entries = memmap.nr_entries; - - sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), - &xen_e820_map_entries); - - for (i = 0; i < xen_e820_map_entries; i++) - e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, - xen_e820_map[i].type); - - /* Remove p2m info, it is not needed. */ - xen_start_info->mfn_list = 0; - xen_start_info->first_p2m_pfn = 0; - xen_start_info->nr_p2m_frames = 0; - - return "Xen"; -} - -/* * Set the bit indicating "nosegneg" library variants should be used. * We only need to bother in pure 32-bit mode; compat 32-bit processes * can have un-truncated segments, so wrapping around is allowed. @@ -1032,8 +999,8 @@ void __init xen_pvmmu_arch_setup(void) void __init xen_arch_setup(void) { xen_panic_handler_init(); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_pvmmu_arch_setup(); + + xen_pvmmu_arch_setup(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 9fa27ceeecfd..7ff2f1bfb7ec 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -18,6 +18,7 @@ #include <linux/smp.h> #include <linux/irq_work.h> #include <linux/tick.h> +#include <linux/nmi.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -87,12 +88,6 @@ static void cpu_bringup(void) cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); - /* - * identify_cpu() may have set logical_pkg_id to -1 due - * to incorrect phys_proc_id. Let's re-comupte it. - */ - topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu); - xen_setup_cpu_clockevents(); notify_cpu_starting(cpu); @@ -105,18 +100,8 @@ static void cpu_bringup(void) local_irq_enable(); } -/* - * Note: cpu parameter is only relevant for PVH. The reason for passing it - * is we can't do smp_processor_id until the percpu segments are loaded, for - * which we need the cpu number! So we pass it in rdi as first parameter. - */ -asmlinkage __visible void cpu_bringup_and_idle(int cpu) +asmlinkage __visible void cpu_bringup_and_idle(void) { -#ifdef CONFIG_XEN_PVH - if (xen_feature(XENFEAT_auto_translated_physmap) && - xen_feature(XENFEAT_supervisor_mode_kernel)) - xen_pvh_secondary_vcpu_init(cpu); -#endif cpu_bringup(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } @@ -410,61 +395,47 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_table(cpu); #ifdef CONFIG_X86_32 - /* Note: PVH is not yet supported on x86_32. */ ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; - xen_copy_trap_info(ctxt->trap_ctxt); + xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; + ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); - gdt_mfn = arbitrary_virt_to_mfn(gdt); - make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; #ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; #else - ctxt->gs_base_kernel = per_cpu_offset(cpu); -#endif - ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = - (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - } -#ifdef CONFIG_XEN_PVH - else { - /* - * The vcpu comes on kernel page tables which have the NX pte - * bit set. This means before DS/SS is touched, NX in - * EFER must be set. Hence the following assembly glue code. - */ - ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init; - ctxt->user_regs.rdi = cpu; - ctxt->user_regs.rsi = true; /* entry == true */ - } + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c5c16dc4f694..9beef333584a 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -21,12 +21,4 @@ static inline int xen_smp_intr_init(unsigned int cpu) static inline void xen_smp_intr_free(unsigned int cpu) {} #endif /* CONFIG_SMP */ -#ifdef CONFIG_XEN_PVH -extern void xen_pvh_early_cpu_init(int cpu, bool entry); -#else -static inline void xen_pvh_early_cpu_init(int cpu, bool entry) -{ -} -#endif - #endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 3d6e0064cbfc..25a7c4302ce7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -114,6 +114,7 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } +PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); /* * Our init of PV spinlocks is split in two init functions due to us @@ -137,27 +138,9 @@ void __init xen_init_spinlocks(void) pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = xen_qlock_wait; pv_lock_ops.kick = xen_qlock_kick; + pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } -/* - * While the jump_label init code needs to happend _after_ the jump labels are - * enabled and before SMP is started. Hence we use pre-SMP initcall level - * init. We cannot do it in xen_init_spinlocks as that is done before - * jump labels are activated. - */ -static __init int xen_init_spinlocks_jump(void) -{ - if (!xen_pvspin) - return 0; - - if (!xen_domain()) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - return 0; -} -early_initcall(xen_init_spinlocks_jump); - static __init int xen_parse_nopvspin(char *arg) { xen_pvspin = false; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 33d8f6a7829d..1e69956d7852 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -39,10 +39,10 @@ static unsigned long xen_tsc_khz(void) return pvclock_tsc_khz(info); } -cycle_t xen_clocksource_read(void) +u64 xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; - cycle_t ret; + u64 ret; preempt_disable_notrace(); src = &__this_cpu_read(xen_vcpu)->time; @@ -51,7 +51,7 @@ cycle_t xen_clocksource_read(void) return ret; } -static cycle_t xen_clocksource_get_cycles(struct clocksource *cs) +static u64 xen_clocksource_get_cycles(struct clocksource *cs) { return xen_clocksource_read(); } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7f8d8abf4c1a..37794e42b67d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -16,25 +16,6 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> -#ifdef CONFIG_XEN_PVH -#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" -/* Note the lack of 'hvm_callback_vector'. Older hypervisor will - * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in - * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. - */ -#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ - (1 << XENFEAT_auto_translated_physmap) | \ - (1 << XENFEAT_supervisor_mode_kernel) | \ - (1 << XENFEAT_hvm_callback_vector)) -/* The XENFEAT_writable_page_tables is not stricly necessary as we set that - * up regardless whether this CONFIG option is enabled or not, but it - * clarifies what the right flags need to be. - */ -#else -#define PVH_FEATURES_STR "" -#define PVH_FEATURES (0) -#endif - __INIT ENTRY(startup_xen) cld @@ -54,41 +35,6 @@ ENTRY(startup_xen) __FINIT -#ifdef CONFIG_XEN_PVH -/* - * xen_pvh_early_cpu_init() - early PVH VCPU initialization - * @cpu: this cpu number (%rdi) - * @entry: true if this is a secondary vcpu coming up on this entry - * point, false if this is the boot CPU being initialized for - * the first time (%rsi) - * - * Note: This is called as a function on the boot CPU, and is the entry point - * on the secondary CPU. - */ -ENTRY(xen_pvh_early_cpu_init) - mov %rsi, %r11 - - /* Gather features to see if NX implemented. */ - mov $0x80000001, %eax - cpuid - mov %edx, %esi - - mov $MSR_EFER, %ecx - rdmsr - bts $_EFER_SCE, %eax - - bt $20, %esi - jnc 1f /* No NX, skip setting it */ - bts $_EFER_NX, %eax -1: wrmsr -#ifdef CONFIG_SMP - cmp $0, %r11b - jne cpu_bringup_and_idle -#endif - ret - -#endif /* CONFIG_XEN_PVH */ - .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) @@ -114,10 +60,10 @@ ENTRY(hypercall_page) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) - ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | - (1 << XENFEAT_writable_page_tables) | - (1 << XENFEAT_dom0)) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, + .ascii "!writable_page_tables|pae_pgdir_above_4gb") + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, + .long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3cbce3b085e7..f6a41c41ebc7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -67,7 +67,7 @@ void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); -cycle_t xen_clocksource_read(void); +u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void __init xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); @@ -146,5 +146,4 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); -void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */ diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S new file mode 100644 index 000000000000..5e246716d58f --- /dev/null +++ b/arch/x86/xen/xen-pvh.S @@ -0,0 +1,161 @@ +/* + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + .code32 + .text +#define _pa(x) ((x) - __START_KERNEL_map) + +#include <linux/elfnote.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/asm.h> +#include <asm/boot.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <xen/interface/elfnote.h> + + __HEAD + +/* + * Entry point for PVH guests. + * + * Xen ABI specifies the following register state when we come here: + * + * - `ebx`: contains the physical memory address where the loader has placed + * the boot start info structure. + * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared. + * - `cr4`: all bits are cleared. + * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’ + * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified. + * - `ds`, `es`: must be a 32-bit read/write data segment with a base of + * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all + * unspecified. + * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit + * of '0x67'. + * - `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared. + * Bit 8 (TF) must be cleared. Other bits are all unspecified. + * + * All other processor registers and flag bits are unspecified. The OS is in + * charge of setting up it's own stack, GDT and IDT. + */ + +ENTRY(pvh_start_xen) + cld + + lgdt (_pa(gdt)) + + mov $(__BOOT_DS),%eax + mov %eax,%ds + mov %eax,%es + mov %eax,%ss + + /* Stash hvm_start_info. */ + mov $_pa(pvh_start_info), %edi + mov %ebx, %esi + mov _pa(pvh_start_info_sz), %ecx + shr $2,%ecx + rep + movsl + + mov $_pa(early_stack_end), %esp + + /* Enable PAE mode. */ + mov %cr4, %eax + orl $X86_CR4_PAE, %eax + mov %eax, %cr4 + +#ifdef CONFIG_X86_64 + /* Enable Long mode. */ + mov $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Enable pre-constructed page tables. */ + mov $_pa(init_level4_pgt), %eax + mov %eax, %cr3 + mov $(X86_CR0_PG | X86_CR0_PE), %eax + mov %eax, %cr0 + + /* Jump to 64-bit mode. */ + ljmp $__KERNEL_CS, $_pa(1f) + + /* 64-bit entry point. */ + .code64 +1: + call xen_prepare_pvh + + /* startup_64 expects boot_params in %rsi. */ + mov $_pa(pvh_bootparams), %rsi + mov $_pa(startup_64), %rax + jmp *%rax + +#else /* CONFIG_X86_64 */ + + call mk_early_pgtbl_32 + + mov $_pa(initial_page_table), %eax + mov %eax, %cr3 + + mov %cr0, %eax + or $(X86_CR0_PG | X86_CR0_PE), %eax + mov %eax, %cr0 + + ljmp $__BOOT_CS, $1f +1: + call xen_prepare_pvh + mov $_pa(pvh_bootparams), %esi + + /* startup_32 doesn't expect paging and PAE to be on. */ + ljmp $__BOOT_CS, $_pa(2f) +2: + mov %cr0, %eax + and $~X86_CR0_PG, %eax + mov %eax, %cr0 + mov %cr4, %eax + and $~X86_CR4_PAE, %eax + mov %eax, %cr4 + + ljmp $__BOOT_CS, $_pa(startup_32) +#endif +END(pvh_start_xen) + + .section ".init.data","aw" + .balign 8 +gdt: + .word gdt_end - gdt_start + .long _pa(gdt_start) + .word 0 +gdt_start: + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* reserved */ +#ifdef CONFIG_X86_64 + .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */ +#else + .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */ +#endif + .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */ +gdt_end: + + .balign 4 +early_stack: + .fill 256, 1, 0 +early_stack_end: + + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, + _ASM_PTR (pvh_start_xen - __START_KERNEL_map)) |