diff options
Diffstat (limited to 'arch/x86')
237 files changed, 8904 insertions, 3286 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f1dbb4ee19d7..b0312f8947ce 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -63,7 +63,7 @@ config X86 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 - select ARCH_HAS_UACCESS_MCSAFE if X86_64 + select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX @@ -75,6 +75,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP @@ -180,13 +181,14 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR select HAVE_STACK_VALIDATION if X86_64 select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER + select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING select NEED_SG_DMA_LENGTH select PCI_LOCKLESS_CONFIG @@ -345,8 +347,6 @@ config PGTABLE_LEVELS default 3 if X86_PAE default 2 -source "init/Kconfig" - config CC_HAS_SANE_STACKPROTECTOR bool default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC)) if 64BIT @@ -355,8 +355,6 @@ config CC_HAS_SANE_STACKPROTECTOR We have to make sure stack protector is unconditionally disabled if the compiler produces broken code. -source "kernel/Kconfig.freezer" - menu "Processor type and features" config ZONE_DMA @@ -1043,8 +1041,6 @@ config SCHED_MC_PRIO If unsure say Y here. -source "kernel/Kconfig.preempt" - config UP_LATE_INIT def_bool y depends on !SMP && X86_LOCAL_APIC @@ -1638,8 +1634,6 @@ config ILLEGAL_POINTER_VALUE default 0 if X86_32 default 0xdead000000000000 if X86_64 -source "mm/Kconfig" - config X86_PMEM_LEGACY_DEVICE bool @@ -2865,9 +2859,7 @@ config X86_SYSFB endmenu -menu "Executable file formats / Emulations" - -source "fs/Kconfig.binfmt" +menu "Binary Emulations" config IA32_EMULATION bool "IA32 Emulation" @@ -2937,20 +2929,6 @@ config X86_DMA_REMAP config HAVE_GENERIC_GUP def_bool y -source "net/Kconfig" - -source "drivers/Kconfig" - source "drivers/firmware/Kconfig" -source "fs/Kconfig" - -source "arch/x86/Kconfig.debug" - -source "security/Kconfig" - -source "crypto/Kconfig" - source "arch/x86/kvm/Kconfig" - -source "lib/Kconfig" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index c6dd1d980081..7d68f0c7cfb1 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -1,11 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -menu "Kernel hacking" config TRACE_IRQFLAGS_SUPPORT def_bool y -source "lib/Kconfig.debug" - config EARLY_PRINTK_USB bool @@ -410,5 +407,3 @@ endchoice config FRAME_POINTER depends on !UNWINDER_ORC && !UNWINDER_GUESS bool - -endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a08e82856563..7e3c07d6ad42 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -80,11 +80,6 @@ ifeq ($(CONFIG_X86_32),y) # alignment instructions. KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) - # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use - # a lot more stack due to the lack of sharing of stacklots: - KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \ - $(call cc-option,-fno-unit-at-a-time)) - # CPU-specific tuning. Anything which can be shared with UML should go here. include arch/x86/Makefile_32.cpu KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 45af19921ebd..5296f8c9e7f0 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -13,8 +13,6 @@ KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS += $(call cc-option,-m32) LINK-y += $(call cc-option,-m32) -export LDFLAGS - LDS_EXTRA := -Ui386 export LDS_EXTRA diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 0d41d68131cc..2e1382486e91 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -17,6 +17,7 @@ #define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */ #include <linux/types.h> +#include <asm/asm.h> static inline bool constant_test_bit(int nr, const void *addr) { @@ -28,7 +29,7 @@ static inline bool variable_test_bit(int nr, const void *addr) bool v; const u32 *p = (const u32 *)addr; - asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); + asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr)); return v; } diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fa42f895fdde..169c2feda14a 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -106,9 +106,13 @@ define cmd_check_data_rel done endef +# We need to run two commands under "if_changed", so merge them into a +# single invocation. +quiet_cmd_check-and-link-vmlinux = LD $@ + cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld) + $(obj)/vmlinux: $(vmlinux-objs-y) FORCE - $(call if_changed,check_data_rel) - $(call if_changed,ld) + $(call if_changed,check-and-link-vmlinux) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index e57665b4ba1c..1458b1700fc7 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -34,74 +34,13 @@ static void setup_boot_services##bits(struct efi_config *c) \ \ table = (typeof(table))sys_table; \ \ - c->runtime_services = table->runtime; \ - c->boot_services = table->boottime; \ - c->text_output = table->con_out; \ + c->runtime_services = table->runtime; \ + c->boot_services = table->boottime; \ + c->text_output = table->con_out; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); -static inline efi_status_t __open_volume32(void *__image, void **__fh) -{ - efi_file_io_interface_t *io; - efi_loaded_image_32_t *image = __image; - efi_file_handle_32_t *fh; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - efi_status_t status; - void *handle = (void *)(unsigned long)image->device_handle; - unsigned long func; - - status = efi_call_early(handle_protocol, handle, - &fs_proto, (void **)&io); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to handle fs_proto\n"); - return status; - } - - func = (unsigned long)io->open_volume; - status = efi_early->call(func, io, &fh); - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to open volume\n"); - - *__fh = fh; - return status; -} - -static inline efi_status_t __open_volume64(void *__image, void **__fh) -{ - efi_file_io_interface_t *io; - efi_loaded_image_64_t *image = __image; - efi_file_handle_64_t *fh; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - efi_status_t status; - void *handle = (void *)(unsigned long)image->device_handle; - unsigned long func; - - status = efi_call_early(handle_protocol, handle, - &fs_proto, (void **)&io); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to handle fs_proto\n"); - return status; - } - - func = (unsigned long)io->open_volume; - status = efi_early->call(func, io, &fh); - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to open volume\n"); - - *__fh = fh; - return status; -} - -efi_status_t -efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) -{ - if (efi_early->is64) - return __open_volume64(__image, __fh); - - return __open_volume32(__image, __fh); -} - void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { efi_call_proto(efi_simple_text_output_protocol, output_string, @@ -109,23 +48,17 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } static efi_status_t -__setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) +preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) { struct pci_setup_rom *rom = NULL; efi_status_t status; unsigned long size; - uint64_t attributes, romsize; + uint64_t romsize; void *romimage; - status = efi_call_proto(efi_pci_io_protocol, attributes, pci, - EfiPciIoAttributeOperationGet, 0ULL, - &attributes); - if (status != EFI_SUCCESS) - return status; - /* - * Some firmware images contain EFI function pointers at the place where the - * romimage and romsize fields are supposed to be. Typically the EFI + * Some firmware images contain EFI function pointers at the place where + * the romimage and romsize fields are supposed to be. Typically the EFI * code is mapped at high addresses, translating to an unrealistically * large romsize. The UEFI spec limits the size of option ROMs to 16 * MiB so we reject any ROMs over 16 MiB in size to catch this. @@ -140,16 +73,16 @@ __setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for rom\n"); + efi_printk(sys_table, "Failed to allocate memory for 'rom'\n"); return status; } memset(rom, 0, sizeof(*rom)); - rom->data.type = SETUP_PCI; - rom->data.len = size - sizeof(struct setup_data); - rom->data.next = 0; - rom->pcilen = pci->romsize; + rom->data.type = SETUP_PCI; + rom->data.len = size - sizeof(struct setup_data); + rom->data.next = 0; + rom->pcilen = pci->romsize; *__rom = rom; status = efi_call_proto(efi_pci_io_protocol, pci.read, pci, @@ -185,96 +118,6 @@ free_struct: return status; } -static void -setup_efi_pci32(struct boot_params *params, void **pci_handle, - unsigned long size) -{ - efi_pci_io_protocol_t *pci = NULL; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - u32 *handles = (u32 *)(unsigned long)pci_handle; - efi_status_t status; - unsigned long nr_pci; - struct setup_data *data; - int i; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - nr_pci = size / sizeof(u32); - for (i = 0; i < nr_pci; i++) { - struct pci_setup_rom *rom = NULL; - u32 h = handles[i]; - - status = efi_call_early(handle_protocol, h, - &pci_proto, (void **)&pci); - - if (status != EFI_SUCCESS) - continue; - - if (!pci) - continue; - - status = __setup_efi_pci(pci, &rom); - if (status != EFI_SUCCESS) - continue; - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - - } -} - -static void -setup_efi_pci64(struct boot_params *params, void **pci_handle, - unsigned long size) -{ - efi_pci_io_protocol_t *pci = NULL; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - u64 *handles = (u64 *)(unsigned long)pci_handle; - efi_status_t status; - unsigned long nr_pci; - struct setup_data *data; - int i; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - nr_pci = size / sizeof(u64); - for (i = 0; i < nr_pci; i++) { - struct pci_setup_rom *rom = NULL; - u64 h = handles[i]; - - status = efi_call_early(handle_protocol, h, - &pci_proto, (void **)&pci); - - if (status != EFI_SUCCESS) - continue; - - if (!pci) - continue; - - status = __setup_efi_pci(pci, &rom); - if (status != EFI_SUCCESS) - continue; - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - - } -} - /* * There's no way to return an informative status from this function, * because any analysis (and printing of error messages) needs to be @@ -290,6 +133,9 @@ static void setup_efi_pci(struct boot_params *params) void **pci_handle = NULL; efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; unsigned long size = 0; + unsigned long nr_pci; + struct setup_data *data; + int i; status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, @@ -301,7 +147,7 @@ static void setup_efi_pci(struct boot_params *params) size, (void **)&pci_handle); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for pci_handle\n"); + efi_printk(sys_table, "Failed to allocate memory for 'pci_handle'\n"); return; } @@ -313,10 +159,34 @@ static void setup_efi_pci(struct boot_params *params) if (status != EFI_SUCCESS) goto free_handle; - if (efi_early->is64) - setup_efi_pci64(params, pci_handle, size); - else - setup_efi_pci32(params, pci_handle, size); + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + nr_pci = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32)); + for (i = 0; i < nr_pci; i++) { + efi_pci_io_protocol_t *pci = NULL; + struct pci_setup_rom *rom; + + status = efi_call_early(handle_protocol, + efi_is_64bit() ? ((u64 *)pci_handle)[i] + : ((u32 *)pci_handle)[i], + &pci_proto, (void **)&pci); + if (status != EFI_SUCCESS || !pci) + continue; + + status = preserve_pci_rom_image(pci, &rom); + if (status != EFI_SUCCESS) + continue; + + if (data) + data->next = (unsigned long)rom; + else + params->hdr.setup_data = (unsigned long)rom; + + data = (struct setup_data *)rom; + } free_handle: efi_call_early(free_pool, pci_handle); @@ -347,8 +217,7 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size + sizeof(struct setup_data), &new); if (status != EFI_SUCCESS) { - efi_printk(sys_table, - "Failed to alloc mem for properties\n"); + efi_printk(sys_table, "Failed to allocate memory for 'properties'\n"); return; } @@ -364,9 +233,9 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) new->next = 0; data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; - if (!data) + if (!data) { boot_params->hdr.setup_data = (unsigned long)new; - else { + } else { while (data->next) data = (struct setup_data *)(unsigned long)data->next; data->next = (unsigned long)new; @@ -386,81 +255,55 @@ static void setup_quirks(struct boot_params *boot_params) } } +/* + * See if we have Universal Graphics Adapter (UGA) protocol + */ static efi_status_t -setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) +setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size) { - struct efi_uga_draw_protocol *uga = NULL, *first_uga; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; + efi_status_t status; + u32 width, height; + void **uga_handle = NULL; + efi_uga_draw_protocol_t *uga = NULL, *first_uga; unsigned long nr_ugas; - u32 *handles = (u32 *)uga_handle; - efi_status_t status = EFI_INVALID_PARAMETER; int i; - first_uga = NULL; - nr_ugas = size / sizeof(u32); - for (i = 0; i < nr_ugas; i++) { - efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; - u32 w, h, depth, refresh; - void *pciio; - u32 handle = handles[i]; - - status = efi_call_early(handle_protocol, handle, - &uga_proto, (void **)&uga); - if (status != EFI_SUCCESS) - continue; - - efi_call_early(handle_protocol, handle, &pciio_proto, &pciio); - - status = efi_early->call((unsigned long)uga->get_mode, uga, - &w, &h, &depth, &refresh); - if (status == EFI_SUCCESS && (!first_uga || pciio)) { - *width = w; - *height = h; - - /* - * Once we've found a UGA supporting PCIIO, - * don't bother looking any further. - */ - if (pciio) - break; - - first_uga = uga; - } - } + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&uga_handle); + if (status != EFI_SUCCESS) + return status; - return status; -} + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + uga_proto, NULL, &size, uga_handle); + if (status != EFI_SUCCESS) + goto free_handle; -static efi_status_t -setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) -{ - struct efi_uga_draw_protocol *uga = NULL, *first_uga; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; - unsigned long nr_ugas; - u64 *handles = (u64 *)uga_handle; - efi_status_t status = EFI_INVALID_PARAMETER; - int i; + height = 0; + width = 0; first_uga = NULL; - nr_ugas = size / sizeof(u64); + nr_ugas = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32)); for (i = 0; i < nr_ugas; i++) { efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; u32 w, h, depth, refresh; void *pciio; - u64 handle = handles[i]; + unsigned long handle = efi_is_64bit() ? ((u64 *)uga_handle)[i] + : ((u32 *)uga_handle)[i]; status = efi_call_early(handle_protocol, handle, - &uga_proto, (void **)&uga); + uga_proto, (void **)&uga); if (status != EFI_SUCCESS) continue; + pciio = NULL; efi_call_early(handle_protocol, handle, &pciio_proto, &pciio); - status = efi_early->call((unsigned long)uga->get_mode, uga, - &w, &h, &depth, &refresh); + status = efi_call_proto(efi_uga_draw_protocol, get_mode, uga, + &w, &h, &depth, &refresh); if (status == EFI_SUCCESS && (!first_uga || pciio)) { - *width = w; - *height = h; + width = w; + height = h; /* * Once we've found a UGA supporting PCIIO, @@ -473,59 +316,28 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) } } - return status; -} - -/* - * See if we have Universal Graphics Adapter (UGA) protocol - */ -static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, - unsigned long size) -{ - efi_status_t status; - u32 width, height; - void **uga_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&uga_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - uga_proto, NULL, &size, uga_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - height = 0; - width = 0; - - if (efi_early->is64) - status = setup_uga64(uga_handle, size, &width, &height); - else - status = setup_uga32(uga_handle, size, &width, &height); - if (!width && !height) goto free_handle; /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; + si->orig_video_isVGA = VIDEO_TYPE_EFI; - si->lfb_depth = 32; - si->lfb_width = width; - si->lfb_height = height; + si->lfb_depth = 32; + si->lfb_width = width; + si->lfb_height = height; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; free_handle: efi_call_early(free_pool, uga_handle); + return status; } @@ -592,7 +404,7 @@ struct boot_params *make_boot_params(struct efi_config *c) if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) return NULL; - if (efi_early->is64) + if (efi_is_64bit()) setup_boot_services64(efi_early); else setup_boot_services32(efi_early); @@ -607,7 +419,7 @@ struct boot_params *make_boot_params(struct efi_config *c) status = efi_low_alloc(sys_table, 0x4000, 1, (unsigned long *)&boot_params); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc lowmem for boot params\n"); + efi_printk(sys_table, "Failed to allocate lowmem for boot params\n"); return NULL; } @@ -623,9 +435,9 @@ struct boot_params *make_boot_params(struct efi_config *c) * Fill out some of the header fields ourselves because the * EFI firmware loader doesn't load the first sector. */ - hdr->root_flags = 1; - hdr->vid_mode = 0xffff; - hdr->boot_flag = 0xAA55; + hdr->root_flags = 1; + hdr->vid_mode = 0xffff; + hdr->boot_flag = 0xAA55; hdr->type_of_loader = 0x21; @@ -633,6 +445,7 @@ struct boot_params *make_boot_params(struct efi_config *c) cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size); if (!cmdline_ptr) goto fail; + hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; /* Fill in upper bits of command line address, NOP on 32 bit */ boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; @@ -669,10 +482,12 @@ struct boot_params *make_boot_params(struct efi_config *c) boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32; return boot_params; + fail2: efi_free(sys_table, options_size, hdr->cmd_line_ptr); fail: efi_free(sys_table, 0x4000, (unsigned long)boot_params); + return NULL; } @@ -684,7 +499,7 @@ static void add_e820ext(struct boot_params *params, unsigned long size; e820ext->type = SETUP_E820_EXT; - e820ext->len = nr_entries * sizeof(struct boot_e820_entry); + e820ext->len = nr_entries * sizeof(struct boot_e820_entry); e820ext->next = 0; data = (struct setup_data *)(unsigned long)params->hdr.setup_data; @@ -698,8 +513,8 @@ static void add_e820ext(struct boot_params *params, params->hdr.setup_data = (unsigned long)e820ext; } -static efi_status_t setup_e820(struct boot_params *params, - struct setup_data *e820ext, u32 e820ext_size) +static efi_status_t +setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_size) { struct boot_e820_entry *entry = params->e820_table; struct efi_info *efi = ¶ms->efi_info; @@ -820,11 +635,10 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, } struct exit_boot_struct { - struct boot_params *boot_params; - struct efi_info *efi; - struct setup_data *e820ext; - __u32 e820ext_size; - bool is64; + struct boot_params *boot_params; + struct efi_info *efi; + struct setup_data *e820ext; + __u32 e820ext_size; }; static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, @@ -851,25 +665,25 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, first = false; } - signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; + signature = efi_is_64bit() ? EFI64_LOADER_SIGNATURE + : EFI32_LOADER_SIGNATURE; memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - p->efi->efi_systab = (unsigned long)sys_table_arg; - p->efi->efi_memdesc_size = *map->desc_size; - p->efi->efi_memdesc_version = *map->desc_ver; - p->efi->efi_memmap = (unsigned long)*map->map; - p->efi->efi_memmap_size = *map->map_size; + p->efi->efi_systab = (unsigned long)sys_table_arg; + p->efi->efi_memdesc_size = *map->desc_size; + p->efi->efi_memdesc_version = *map->desc_ver; + p->efi->efi_memmap = (unsigned long)*map->map; + p->efi->efi_memmap_size = *map->map_size; #ifdef CONFIG_X86_64 - p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; - p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; + p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; + p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; #endif return EFI_SUCCESS; } -static efi_status_t exit_boot(struct boot_params *boot_params, - void *handle, bool is64) +static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) { unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; @@ -880,17 +694,16 @@ static efi_status_t exit_boot(struct boot_params *boot_params, struct efi_boot_memmap map; struct exit_boot_struct priv; - map.map = &mem_map; - map.map_size = &map_sz; - map.desc_size = &desc_size; - map.desc_ver = &desc_version; - map.key_ptr = &key; - map.buff_size = &buff_size; - priv.boot_params = boot_params; - priv.efi = &boot_params->efi_info; - priv.e820ext = NULL; - priv.e820ext_size = 0; - priv.is64 = is64; + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; + priv.boot_params = boot_params; + priv.efi = &boot_params->efi_info; + priv.e820ext = NULL; + priv.e820ext_size = 0; /* Might as well exit boot services now */ status = efi_exit_boot_services(sys_table, handle, &map, &priv, @@ -898,10 +711,11 @@ static efi_status_t exit_boot(struct boot_params *boot_params, if (status != EFI_SUCCESS) return status; - e820ext = priv.e820ext; - e820ext_size = priv.e820ext_size; + e820ext = priv.e820ext; + e820ext_size = priv.e820ext_size; + /* Historic? */ - boot_params->alt_mem_k = 32 * 1024; + boot_params->alt_mem_k = 32 * 1024; status = setup_e820(boot_params, e820ext, e820ext_size); if (status != EFI_SUCCESS) @@ -914,8 +728,8 @@ static efi_status_t exit_boot(struct boot_params *boot_params, * On success we return a pointer to a boot_params structure, and NULL * on failure. */ -struct boot_params *efi_main(struct efi_config *c, - struct boot_params *boot_params) +struct boot_params * +efi_main(struct efi_config *c, struct boot_params *boot_params) { struct desc_ptr *gdt = NULL; efi_loaded_image_t *image; @@ -924,13 +738,11 @@ struct boot_params *efi_main(struct efi_config *c, struct desc_struct *desc; void *handle; efi_system_table_t *_table; - bool is64; efi_early = c; _table = (efi_system_table_t *)(unsigned long)efi_early->table; handle = (void *)(unsigned long)efi_early->image_handle; - is64 = efi_early->is64; sys_table = _table; @@ -938,7 +750,7 @@ struct boot_params *efi_main(struct efi_config *c, if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) goto fail; - if (is64) + if (efi_is_64bit()) setup_boot_services64(efi_early); else setup_boot_services32(efi_early); @@ -963,7 +775,7 @@ struct boot_params *efi_main(struct efi_config *c, status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for gdt structure\n"); + efi_printk(sys_table, "Failed to allocate memory for 'gdt' structure\n"); goto fail; } @@ -971,7 +783,7 @@ struct boot_params *efi_main(struct efi_config *c, status = efi_low_alloc(sys_table, gdt->size, 8, (unsigned long *)&gdt->address); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for gdt\n"); + efi_printk(sys_table, "Failed to allocate memory for 'gdt'\n"); goto fail; } @@ -994,7 +806,7 @@ struct boot_params *efi_main(struct efi_config *c, hdr->code32_start = bzimage_addr; } - status = exit_boot(boot_params, handle, is64); + status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) { efi_printk(sys_table, "exit_boot() failed!\n"); goto fail; @@ -1008,19 +820,20 @@ struct boot_params *efi_main(struct efi_config *c, if (IS_ENABLED(CONFIG_X86_64)) { /* __KERNEL32_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + desc++; } else { /* Second entry is unused on 32-bit */ @@ -1028,15 +841,16 @@ struct boot_params *efi_main(struct efi_config *c, } /* __KERNEL_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; desc->d = 0; @@ -1044,41 +858,41 @@ struct boot_params *efi_main(struct efi_config *c, desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; } - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; /* __KERNEL_DS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; if (IS_ENABLED(CONFIG_X86_64)) { /* Task segment value */ - desc->limit0 = 0x0000; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_TSS; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0x0; - desc->avl = 0; - desc->l = 0; - desc->d = 0; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; } @@ -1088,5 +902,6 @@ struct boot_params *efi_main(struct efi_config *c, return boot_params; fail: efi_printk(sys_table, "efi_main() failed!\n"); + return NULL; } diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index e799dc5c6448..8297387c4676 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -12,22 +12,22 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -struct efi_uga_draw_protocol_32 { +typedef struct { u32 get_mode; u32 set_mode; u32 blt; -}; +} efi_uga_draw_protocol_32_t; -struct efi_uga_draw_protocol_64 { +typedef struct { u64 get_mode; u64 set_mode; u64 blt; -}; +} efi_uga_draw_protocol_64_t; -struct efi_uga_draw_protocol { +typedef struct { void *get_mode; void *set_mode; void *blt; -}; +} efi_uga_draw_protocol_t; #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b87a7582853d..302517929932 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -102,7 +102,7 @@ static bool memmap_too_large; /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ -unsigned long long mem_limit = ULLONG_MAX; +static unsigned long long mem_limit = ULLONG_MAX; enum mem_avoid_index { @@ -215,7 +215,36 @@ static void mem_avoid_memmap(char *str) memmap_too_large = true; } -static int handle_mem_memmap(void) +/* Store the number of 1GB huge pages which users specified: */ +static unsigned long max_gb_huge_pages; + +static void parse_gb_huge_pages(char *param, char *val) +{ + static bool gbpage_sz; + char *p; + + if (!strcmp(param, "hugepagesz")) { + p = val; + if (memparse(p, &p) != PUD_SIZE) { + gbpage_sz = false; + return; + } + + if (gbpage_sz) + warn("Repeatedly set hugeTLB page size of 1G!\n"); + gbpage_sz = true; + return; + } + + if (!strcmp(param, "hugepages") && gbpage_sz) { + p = val; + max_gb_huge_pages = simple_strtoull(p, &p, 0); + return; + } +} + + +static int handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); size_t len = strlen((char *)args); @@ -223,7 +252,8 @@ static int handle_mem_memmap(void) char *param, *val; u64 mem_size; - if (!strstr(args, "memmap=") && !strstr(args, "mem=")) + if (!strstr(args, "memmap=") && !strstr(args, "mem=") && + !strstr(args, "hugepages")) return 0; tmp_cmdline = malloc(len + 1); @@ -248,6 +278,8 @@ static int handle_mem_memmap(void) if (!strcmp(param, "memmap")) { mem_avoid_memmap(val); + } else if (strstr(param, "hugepages")) { + parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { char *p = val; @@ -387,7 +419,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ /* Mark the memmap regions we need to avoid */ - handle_mem_memmap(); + handle_mem_options(); #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ @@ -466,6 +498,60 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) } } +/* + * Skip as many 1GB huge pages as possible in the passed region + * according to the number which users specified: + */ +static void +process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) +{ + unsigned long addr, size = 0; + struct mem_vector tmp; + int i = 0; + + if (!max_gb_huge_pages) { + store_slot_info(region, image_size); + return; + } + + addr = ALIGN(region->start, PUD_SIZE); + /* Did we raise the address above the passed in memory entry? */ + if (addr < region->start + region->size) + size = region->size - (addr - region->start); + + /* Check how many 1GB huge pages can be filtered out: */ + while (size > PUD_SIZE && max_gb_huge_pages) { + size -= PUD_SIZE; + max_gb_huge_pages--; + i++; + } + + /* No good 1GB huge pages found: */ + if (!i) { + store_slot_info(region, image_size); + return; + } + + /* + * Skip those 'i'*1GB good huge pages, and continue checking and + * processing the remaining head or tail part of the passed region + * if available. + */ + + if (addr >= region->start + image_size) { + tmp.start = region->start; + tmp.size = addr - region->start; + store_slot_info(&tmp, image_size); + } + + size = region->size - (addr - region->start) - i * PUD_SIZE; + if (size >= image_size) { + tmp.start = addr + i * PUD_SIZE; + tmp.size = size; + store_slot_info(&tmp, image_size); + } +} + static unsigned long slots_fetch_random(void) { unsigned long slot; @@ -546,7 +632,7 @@ static void process_mem_region(struct mem_vector *entry, /* If nothing overlaps, store the region and return. */ if (!mem_avoid_overlap(®ion, &overlap)) { - store_slot_info(®ion, image_size); + process_gb_huge_pages(®ion, image_size); return; } @@ -556,7 +642,7 @@ static void process_mem_region(struct mem_vector *entry, beginning.start = region.start; beginning.size = overlap.start - region.start; - store_slot_info(&beginning, image_size); + process_gb_huge_pages(&beginning, image_size); } /* Return if overlap extends to or past end of region. */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 8c5107545251..9e2157371491 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,3 +1,4 @@ +#include <asm/e820/types.h> #include <asm/processor.h> #include "pgtable.h" #include "../string.h" @@ -34,10 +35,62 @@ unsigned long *trampoline_32bit __section(.data); extern struct boot_params *boot_params; int cmdline_find_option_bool(const char *option); +static unsigned long find_trampoline_placement(void) +{ + unsigned long bios_start, ebda_start; + unsigned long trampoline_start; + struct boot_e820_entry *entry; + int i; + + /* + * Find a suitable spot for the trampoline. + * This code is based on reserve_bios_regions(). + */ + + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; + + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + bios_start = round_down(bios_start, PAGE_SIZE); + + /* Find the first usable memory region under bios_start. */ + for (i = boot_params->e820_entries - 1; i >= 0; i--) { + entry = &boot_params->e820_table[i]; + + /* Skip all entries above bios_start. */ + if (bios_start <= entry->addr) + continue; + + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + + /* Adjust bios_start to the end of the entry if needed. */ + if (bios_start > entry->addr + entry->size) + bios_start = entry->addr + entry->size; + + /* Keep bios_start page-aligned. */ + bios_start = round_down(bios_start, PAGE_SIZE); + + /* Skip the entry if it's too small. */ + if (bios_start - TRAMPOLINE_32BIT_SIZE < entry->addr) + continue; + + break; + } + + /* Place the trampoline just below the end of low memory */ + return bios_start - TRAMPOLINE_32BIT_SIZE; +} + struct paging_config paging_prepare(void *rmode) { struct paging_config paging_config = {}; - unsigned long bios_start, ebda_start; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ boot_params = rmode; @@ -61,23 +114,7 @@ struct paging_config paging_prepare(void *rmode) paging_config.l5_required = 1; } - /* - * Find a suitable spot for the trampoline. - * This code is based on reserve_bios_regions(). - */ - - ebda_start = *(unsigned short *)0x40e << 4; - bios_start = *(unsigned short *)0x413 << 10; - - if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) - bios_start = BIOS_START_MAX; - - if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) - bios_start = ebda_start; - - /* Place the trampoline just below the end of low memory, aligned to 4k */ - paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE; - paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE); + paging_config.trampoline_start = find_trampoline_placement(); trampoline_32bit = (unsigned long *)paging_config.trampoline_start; diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 16f49123d747..c4428a176973 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -13,6 +13,7 @@ */ #include <linux/types.h> +#include <asm/asm.h> #include "ctype.h" #include "string.h" @@ -28,8 +29,8 @@ int memcmp(const void *s1, const void *s2, size_t len) { bool diff; - asm("repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + asm("repe; cmpsb" CC_SET(nz) + : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 9254e0b6cc06..5f7e43d4f64a 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -75,7 +75,7 @@ * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG, MSG mov LEN, %r8 @@ -535,6 +535,7 @@ ENTRY(crypto_aegis128_aesni_enc_tail) movdqu STATE3, 0x40(STATEP) FRAME_END + ret ENDPROC(crypto_aegis128_aesni_enc_tail) .macro decrypt_block a s0 s1 s2 s3 s4 i diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 5de7c0d46edf..acd11b3bf639 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128_aesni_alg, diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S index 9263c344f2c7..491dd61c845c 100644 --- a/arch/x86/crypto/aegis128l-aesni-asm.S +++ b/arch/x86/crypto/aegis128l-aesni-asm.S @@ -66,7 +66,7 @@ * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG0, MSG0 pxor MSG1, MSG1 @@ -645,6 +645,7 @@ ENTRY(crypto_aegis128l_aesni_enc_tail) state_store0 FRAME_END + ret ENDPROC(crypto_aegis128l_aesni_enc_tail) /* diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 876e4866e633..2071c3d1ae07 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128l_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128l_aesni_alg, diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S index 1d977d515bf9..8870c7c5d9a4 100644 --- a/arch/x86/crypto/aegis256-aesni-asm.S +++ b/arch/x86/crypto/aegis256-aesni-asm.S @@ -59,7 +59,7 @@ * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG, MSG mov LEN, %r8 @@ -543,6 +543,7 @@ ENTRY(crypto_aegis256_aesni_enc_tail) state_store0 FRAME_END + ret ENDPROC(crypto_aegis256_aesni_enc_tail) /* diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 2b5dd3af8f4d..b5f2a8fd5a71 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis256_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis256_aesni_alg, diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index e762ef417562..9bd139569b41 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -258,7 +258,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff .macro GCM_INIT Iv SUBKEY AAD AADLEN mov \AADLEN, %r11 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length - xor %r11, %r11 + xor %r11d, %r11d mov %r11, InLen(%arg2) # ctx_data.in_length = 0 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 @@ -286,7 +286,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff movdqu HashKey(%arg2), %xmm13 add %arg5, InLen(%arg2) - xor %r11, %r11 # initialise the data pointer offset as zero + xor %r11d, %r11d # initialise the data pointer offset as zero PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation sub %r11, %arg5 # sub partial block data used @@ -702,7 +702,7 @@ _no_extra_mask_1_\@: # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %rax,%rax + xor %eax, %eax mov %rax, PBlockLen(%arg2) jmp _dec_done_\@ @@ -737,7 +737,7 @@ _no_extra_mask_2_\@: # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %rax,%rax + xor %eax, %eax mov %rax, PBlockLen(%arg2) jmp _encode_done_\@ diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index faecb1518bf8..1985ea0b551b 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -463,7 +463,7 @@ _get_AAD_rest_final\@: _get_AAD_done\@: # initialize the data pointer offset as zero - xor %r11, %r11 + xor %r11d, %r11d # start AES for num_initial_blocks blocks mov arg5, %rax # rax = *Y0 @@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@: _get_AAD_done\@: # initialize the data pointer offset as zero - xor %r11, %r11 + xor %r11d, %r11d # start AES for num_initial_blocks blocks mov arg5, %rax # rax = *Y0 diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 2ddbe3a1868b..3582ae885ee1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -154,8 +154,7 @@ static struct shash_alg ghash_alg = { .cra_name = "__ghash", .cra_driver_name = "__ghash-pclmulqdqni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_SHASH | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, @@ -315,9 +314,8 @@ static struct ahash_alg ghash_async_alg = { .cra_driver_name = "ghash-clmulni", .cra_priority = 400, .cra_ctxsize = sizeof(struct ghash_async_ctx), - .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_init = ghash_async_init_tfm, .cra_exit = ghash_async_exit_tfm, diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S index 37d422e77931..de182c460f82 100644 --- a/arch/x86/crypto/morus1280-avx2-asm.S +++ b/arch/x86/crypto/morus1280-avx2-asm.S @@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero) * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d vpxor MSG, MSG, MSG mov %rcx, %r8 @@ -453,6 +453,7 @@ ENTRY(crypto_morus1280_avx2_enc_tail) vmovdqu STATE4, (4 * 32)(%rdi) FRAME_END + ret ENDPROC(crypto_morus1280_avx2_enc_tail) /* diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c index f111f36d26dc..6634907d6ccd 100644 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); -static const struct x86_cpu_id avx2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AVX2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id); - static int __init crypto_morus1280_avx2_module_init(void) { - if (!x86_match_cpu(avx2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_avx2_algs, diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S index 1fe637c7be9d..da5d2905db60 100644 --- a/arch/x86/crypto/morus1280-sse2-asm.S +++ b/arch/x86/crypto/morus1280-sse2-asm.S @@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero) * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG_LO, MSG_LO pxor MSG_HI, MSG_HI @@ -652,6 +652,7 @@ ENTRY(crypto_morus1280_sse2_enc_tail) movdqu STATE4_HI, (9 * 16)(%rdi) FRAME_END + ret ENDPROC(crypto_morus1280_sse2_enc_tail) /* diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index 839270aa713c..95cf857d2cbb 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus1280_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_sse2_algs, diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S index 71c72a0a0862..414db480250e 100644 --- a/arch/x86/crypto/morus640-sse2-asm.S +++ b/arch/x86/crypto/morus640-sse2-asm.S @@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero) * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG, MSG mov %rcx, %r8 @@ -437,6 +437,7 @@ ENTRY(crypto_morus640_sse2_enc_tail) movdqu STATE4, (4 * 16)(%rdi) FRAME_END + ret ENDPROC(crypto_morus640_sse2_enc_tail) /* diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 26b47e2db8d2..615fb7bc9a32 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus640_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus640_sse2_algs, diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 790377797544..f012b7e28ad1 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -169,7 +169,6 @@ static struct shash_alg alg = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = POLY1305_BLOCK_SIZE, .cra_module = THIS_MODULE, }, diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index e17655ffde79..b93805664c1d 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -746,9 +746,8 @@ static struct ahash_alg sha1_mb_areq_alg = { * algo may not have completed before hashing thread * sleep */ - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT @@ -871,10 +870,16 @@ static struct ahash_alg sha1_mb_async_alg = { .base = { .cra_name = "sha1", .cra_driver_name = "sha1_mb", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + /* + * Low priority, since with few concurrent hash requests + * this is extremely slow due to the flush delay. Users + * whose workloads would benefit from this can request + * it explicitly by driver name, or can increase its + * priority at runtime using NETLINK_CRYPTO. + */ + .cra_priority = 50, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list), .cra_init = sha1_mb_async_init_tfm, diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 6204bd53528c..613d0bfc3d84 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -96,7 +96,7 @@ # cleanup workspace mov $8, %ecx mov %rsp, %rdi - xor %rax, %rax + xor %eax, %eax rep stosq mov %rbp, %rsp # deallocate workspace diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index fc61739150e7..7391c7de72c7 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -104,7 +104,6 @@ static struct shash_alg sha1_ssse3_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -157,7 +156,6 @@ static struct shash_alg sha1_avx_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -249,7 +247,6 @@ static struct shash_alg sha1_avx2_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -307,7 +304,6 @@ static struct shash_alg sha1_ni_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ni", .cra_priority = 250, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 4c46ac1b6653..97c5fc43e115 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -745,9 +745,8 @@ static struct ahash_alg sha256_mb_areq_alg = { * algo may not have completed before hashing thread * sleep */ - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT @@ -870,11 +869,16 @@ static struct ahash_alg sha256_mb_async_alg = { .base = { .cra_name = "sha256", .cra_driver_name = "sha256_mb", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC, + /* + * Low priority, since with few concurrent hash requests + * this is extremely slow due to the flush delay. Users + * whose workloads would benefit from this can request + * it explicitly by driver name, or can increase its + * priority at runtime using NETLINK_CRYPTO. + */ + .cra_priority = 50, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT (sha256_mb_async_alg.halg.base.cra_list), diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index 16c4ccb1f154..d2364c55bbde 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -265,7 +265,7 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 - vmovd _args_digest(state , idx, 4) , %xmm0 + vmovd _args_digest+4*32(state, idx, 4), %xmm1 vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 9e79baf03a4b..773a873d2b28 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -109,7 +109,6 @@ static struct shash_alg sha256_ssse3_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -124,7 +123,6 @@ static struct shash_alg sha256_ssse3_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -177,7 +175,6 @@ static struct shash_alg sha256_avx_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -192,7 +189,6 @@ static struct shash_alg sha256_avx_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -261,7 +257,6 @@ static struct shash_alg sha256_avx2_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -276,7 +271,6 @@ static struct shash_alg sha256_avx2_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -343,7 +337,6 @@ static struct shash_alg sha256_ni_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ni", .cra_priority = 250, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -358,7 +351,6 @@ static struct shash_alg sha256_ni_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ni", .cra_priority = 250, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 39e2bbdc1836..26b85678012d 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -778,9 +778,8 @@ static struct ahash_alg sha512_mb_areq_alg = { * algo may not have completed before hashing thread * sleep */ - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT @@ -904,11 +903,16 @@ static struct ahash_alg sha512_mb_async_alg = { .base = { .cra_name = "sha512", .cra_driver_name = "sha512_mb", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC, + /* + * Low priority, since with few concurrent hash requests + * this is extremely slow due to the flush delay. Users + * whose workloads would benefit from this can request + * it explicitly by driver name, or can increase its + * priority at runtime using NETLINK_CRYPTO. + */ + .cra_priority = 50, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT (sha512_mb_async_alg.halg.base.cra_list), diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 2b0e2a6825f3..f1b811b60ba6 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -109,7 +109,6 @@ static struct shash_alg sha512_ssse3_algs[] = { { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -124,7 +123,6 @@ static struct shash_alg sha512_ssse3_algs[] = { { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -188,7 +186,6 @@ static struct shash_alg sha512_avx_algs[] = { { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -203,7 +200,6 @@ static struct shash_alg sha512_avx_algs[] = { { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -261,7 +257,6 @@ static struct shash_alg sha512_avx2_algs[] = { { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -276,7 +271,6 @@ static struct shash_alg sha512_avx2_algs[] = { { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c371bfee137a..2767c625a52c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -65,7 +65,7 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all +# define resume_kernel restore_all_kernel #endif .macro TRACE_IRQS_IRET @@ -77,6 +77,8 @@ #endif .endm +#define PTI_SWITCH_MASK (1 << PAGE_SHIFT) + /* * User gs save/restore * @@ -154,7 +156,52 @@ #endif /* CONFIG_X86_32_LAZY_GS */ -.macro SAVE_ALL pt_regs_ax=%eax +/* Unconditionally switch to user cr3 */ +.macro SWITCH_TO_USER_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + movl %cr3, \scratch_reg + orl $PTI_SWITCH_MASK, \scratch_reg + movl \scratch_reg, %cr3 +.Lend_\@: +.endm + +.macro BUG_IF_WRONG_CR3 no_user_check=0 +#ifdef CONFIG_DEBUG_ENTRY + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + .if \no_user_check == 0 + /* coming from usermode? */ + testl $SEGMENT_RPL_MASK, PT_CS(%esp) + jz .Lend_\@ + .endif + /* On user-cr3? */ + movl %cr3, %eax + testl $PTI_SWITCH_MASK, %eax + jnz .Lend_\@ + /* From userspace with kernel cr3 - BUG */ + ud2 +.Lend_\@: +#endif +.endm + +/* + * Switch to kernel cr3 if not already loaded and return current cr3 in + * \scratch_reg + */ +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + movl %cr3, \scratch_reg + /* Test if we are already on kernel CR3 */ + testl $PTI_SWITCH_MASK, \scratch_reg + jz .Lend_\@ + andl $(~PTI_SWITCH_MASK), \scratch_reg + movl \scratch_reg, %cr3 + /* Return original CR3 in \scratch_reg */ + orl $PTI_SWITCH_MASK, \scratch_reg +.Lend_\@: +.endm + +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 cld PUSH_GS pushl %fs @@ -173,6 +220,29 @@ movl $(__KERNEL_PERCPU), %edx movl %edx, %fs SET_KERNEL_GS %edx + + /* Switch to kernel stack if necessary */ +.if \switch_stacks > 0 + SWITCH_TO_KERNEL_STACK +.endif + +.endm + +.macro SAVE_ALL_NMI cr3_reg:req + SAVE_ALL + + BUG_IF_WRONG_CR3 + + /* + * Now switch the CR3 when PTI is enabled. + * + * We can enter with either user or kernel cr3, the code will + * store the old cr3 in \cr3_reg and switches to the kernel cr3 + * if necessary. + */ + SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg + +.Lend_\@: .endm /* @@ -221,6 +291,349 @@ POP_GS_EX .endm +.macro RESTORE_ALL_NMI cr3_reg:req pop=0 + /* + * Now switch the CR3 when PTI is enabled. + * + * We enter with kernel cr3 and switch the cr3 to the value + * stored on \cr3_reg, which is either a user or a kernel cr3. + */ + ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI + + testl $PTI_SWITCH_MASK, \cr3_reg + jz .Lswitched_\@ + + /* User cr3 in \cr3_reg - write it to hardware cr3 */ + movl \cr3_reg, %cr3 + +.Lswitched_\@: + + BUG_IF_WRONG_CR3 + + RESTORE_REGS pop=\pop +.endm + +.macro CHECK_AND_APPLY_ESPFIX +#ifdef CONFIG_X86_ESPFIX32 +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + + ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX + + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. + */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + jne .Lend_\@ # returning to user-space with LDT SS + + /* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the IRET: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + lss (%esp), %esp /* switch to espfix segment */ +.Lend_\@: +#endif /* CONFIG_X86_ESPFIX32 */ +.endm + +/* + * Called with pt_regs fully populated and kernel segments loaded, + * so we can access PER_CPU and use the integer registers. + * + * We need to be very careful here with the %esp switch, because an NMI + * can happen everywhere. If the NMI handler finds itself on the + * entry-stack, it will overwrite the task-stack and everything we + * copied there. So allocate the stack-frame on the task-stack and + * switch to it before we do any copying. + */ + +#define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) + +.macro SWITCH_TO_KERNEL_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + BUG_IF_WRONG_CR3 + + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + + /* + * %eax now contains the entry cr3 and we carry it forward in + * that register for the time this macro runs + */ + + /* Are we on the entry stack? Bail out if not! */ + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %esp, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx + jae .Lend_\@ + + /* Load stack pointer into %esi and %edi */ + movl %esp, %esi + movl %esi, %edi + + /* Move %edi to the top of the entry stack */ + andl $(MASK_entry_stack), %edi + addl $(SIZEOF_entry_stack), %edi + + /* Load top of task-stack into %edi */ + movl TSS_entry2task_stack(%edi), %edi + + /* + * Clear unused upper bits of the dword containing the word-sized CS + * slot in pt_regs in case hardware didn't clear it for us. + */ + andl $(0x0000ffff), PT_CS(%esp) + + /* Special case - entry from kernel mode via entry stack */ +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS + movb PT_CS(%esp), %cl + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx +#else + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx +#endif + cmpl $USER_RPL, %ecx + jb .Lentry_from_kernel_\@ + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esi) + jz .Lcopy_pt_regs_\@ + + /* + * Stack-frame contains 4 additional segment registers when + * coming from VM86 mode + */ + addl $(4 * 4), %ecx + +#endif +.Lcopy_pt_regs_\@: + + /* Allocate frame on task-stack */ + subl %ecx, %edi + + /* Switch to task-stack */ + movl %edi, %esp + + /* + * We are now on the task-stack and can safely copy over the + * stack-frame + */ + shrl $2, %ecx + cld + rep movsl + + jmp .Lend_\@ + +.Lentry_from_kernel_\@: + + /* + * This handles the case when we enter the kernel from + * kernel-mode and %esp points to the entry-stack. When this + * happens we need to switch to the task-stack to run C code, + * but switch back to the entry-stack again when we approach + * iret and return to the interrupted code-path. This usually + * happens when we hit an exception while restoring user-space + * segment registers on the way back to user-space or when the + * sysenter handler runs with eflags.tf set. + * + * When we switch to the task-stack here, we can't trust the + * contents of the entry-stack anymore, as the exception handler + * might be scheduled out or moved to another CPU. Therefore we + * copy the complete entry-stack to the task-stack and set a + * marker in the iret-frame (bit 31 of the CS dword) to detect + * what we've done on the iret path. + * + * On the iret path we copy everything back and switch to the + * entry-stack, so that the interrupted kernel code-path + * continues on the same stack it was interrupted with. + * + * Be aware that an NMI can happen anytime in this code. + * + * %esi: Entry-Stack pointer (same as %esp) + * %edi: Top of the task stack + * %eax: CR3 on kernel entry + */ + + /* Calculate number of bytes on the entry stack in %ecx */ + movl %esi, %ecx + + /* %ecx to the top of entry-stack */ + andl $(MASK_entry_stack), %ecx + addl $(SIZEOF_entry_stack), %ecx + + /* Number of bytes on the entry stack to %ecx */ + sub %esi, %ecx + + /* Mark stackframe as coming from entry stack */ + orl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + + /* + * Test the cr3 used to enter the kernel and add a marker + * so that we can switch back to it before iret. + */ + testl $PTI_SWITCH_MASK, %eax + jz .Lcopy_pt_regs_\@ + orl $CS_FROM_USER_CR3, PT_CS(%esp) + + /* + * %esi and %edi are unchanged, %ecx contains the number of + * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate + * the stack-frame on task-stack and copy everything over + */ + jmp .Lcopy_pt_regs_\@ + +.Lend_\@: +.endm + +/* + * Switch back from the kernel stack to the entry stack. + * + * The %esp register must point to pt_regs on the task stack. It will + * first calculate the size of the stack-frame to copy, depending on + * whether we return to VM86 mode or not. With that it uses 'rep movsl' + * to copy the contents of the stack over to the entry stack. + * + * We must be very careful here, as we can't trust the contents of the + * task-stack once we switched to the entry-stack. When an NMI happens + * while on the entry-stack, the NMI handler will switch back to the top + * of the task stack, overwriting our stack-frame we are about to copy. + * Therefore we switch the stack only after everything is copied over. + */ +.macro SWITCH_TO_ENTRY_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp) + jz .Lcopy_pt_regs_\@ + + /* Additional 4 registers to copy when returning to VM86 mode */ + addl $(4 * 4), %ecx + +.Lcopy_pt_regs_\@: +#endif + + /* Initialize source and destination for movsl */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + subl %ecx, %edi + movl %esp, %esi + + /* Save future stack pointer in %ebx */ + movl %edi, %ebx + + /* Copy over the stack-frame */ + shrl $2, %ecx + cld + rep movsl + + /* + * Switch to entry-stack - needs to happen after everything is + * copied because the NMI handler will overwrite the task-stack + * when on entry-stack + */ + movl %ebx, %esp + +.Lend_\@: +.endm + +/* + * This macro handles the case when we return to kernel-mode on the iret + * path and have to switch back to the entry stack and/or user-cr3 + * + * See the comments below the .Lentry_from_kernel_\@ label in the + * SWITCH_TO_KERNEL_STACK macro for more details. + */ +.macro PARANOID_EXIT_TO_KERNEL_MODE + + /* + * Test if we entered the kernel with the entry-stack. Most + * likely we did not, because this code only runs on the + * return-to-kernel path. + */ + testl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + jz .Lend_\@ + + /* Unlikely slow-path */ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp) + + /* Copy the remaining task-stack contents to entry-stack */ + movl %esp, %esi + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + + /* Bytes on the task-stack to ecx */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx + subl %esi, %ecx + + /* Allocate stack-frame on entry-stack */ + subl %ecx, %edi + + /* + * Save future stack-pointer, we must not switch until the + * copy is done, otherwise the NMI handler could destroy the + * contents of the task-stack we are about to copy. + */ + movl %edi, %ebx + + /* Do the copy */ + shrl $2, %ecx + cld + rep movsl + + /* Safe to switch to entry-stack now */ + movl %ebx, %esp + + /* + * We came from entry-stack and need to check if we also need to + * switch back to user cr3. + */ + testl $CS_FROM_USER_CR3, PT_CS(%esp) + jz .Lend_\@ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_USER_CR3), PT_CS(%esp) + + SWITCH_TO_USER_CR3 scratch_reg=%eax + +.Lend_\@: +.endm /* * %eax: prev task * %edx: next task @@ -351,9 +764,9 @@ ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) .Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all + jnz restore_all_kernel testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_all_kernel call preempt_schedule_irq jmp .Lneed_resched END(resume_kernel) @@ -412,7 +825,21 @@ ENTRY(xen_sysenter_target) * 0(%ebp) arg6 */ ENTRY(entry_SYSENTER_32) - movl TSS_sysenter_sp0(%esp), %esp + /* + * On entry-stack with all userspace-regs live - save and + * restore eflags and %eax to use it as scratch-reg for the cr3 + * switch. + */ + pushfl + pushl %eax + BUG_IF_WRONG_CR3 no_user_check=1 + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + popl %eax + popfl + + /* Stack empty again, switch to task stack */ + movl TSS_entry2task_stack(%esp), %esp + .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ @@ -421,7 +848,7 @@ ENTRY(entry_SYSENTER_32) pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */ /* * SYSENTER doesn't filter flags, so we need to clear NT, AC @@ -460,25 +887,49 @@ ENTRY(entry_SYSENTER_32) /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ + + /* + * Setup entry stack - we keep the pointer in %eax and do the + * switch after almost all user-state is restored. + */ + + /* Load entry stack pointer and allocate frame for eflags/eax */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax + subl $(2*4), %eax + + /* Copy eflags and eax to entry stack */ + movl PT_EFLAGS(%esp), %edi + movl PT_EAX(%esp), %esi + movl %edi, (%eax) + movl %esi, 4(%eax) + + /* Restore user registers and segments */ movl PT_EIP(%esp), %edx /* pt_regs->ip */ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ 1: mov PT_FS(%esp), %fs PTGS_TO_GS + popl %ebx /* pt_regs->bx */ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ popl %esi /* pt_regs->si */ popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ - popl %eax /* pt_regs->ax */ + + /* Switch to entry stack */ + movl %eax, %esp + + /* Now ready to switch the cr3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax /* * Restore all flags except IF. (We restore IF separately because * STI gives a one-instruction window in which we won't be interrupted, * whereas POPF does not.) */ - addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ btrl $X86_EFLAGS_IF_BIT, (%esp) + BUG_IF_WRONG_CR3 no_user_check=1 popfl + popl %eax /* * Return back to the vDSO, which will pop ecx and edx. @@ -532,7 +983,8 @@ ENDPROC(entry_SYSENTER_32) ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + + SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */ /* * User mode is traced as though IRQs are on, and the interrupt gate @@ -546,24 +998,17 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET + SWITCH_TO_ENTRY_STACK .Lrestore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX - - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - /* - * Warning: PT_OLDSS(%esp) contains the wrong/random values if we - * are returning to the kernel. - * See comments in process.c:copy_thread() for details. - */ - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je .Lldt_ss # returning to user-space with LDT SS -#endif + CHECK_AND_APPLY_ESPFIX .Lrestore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code + /* Switch back to user CR3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax + + BUG_IF_WRONG_CR3 + + /* Restore user state */ + RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -572,46 +1017,33 @@ restore_all: */ INTERRUPT_RETURN +restore_all_kernel: + TRACE_IRQS_IRET + PARANOID_EXIT_TO_KERNEL_MODE + BUG_IF_WRONG_CR3 + RESTORE_REGS 4 + jmp .Lirq_return + .section .fixup, "ax" ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error - jmp common_exception -.previous - _ASM_EXTABLE(.Lirq_return, iret_exc) -#ifdef CONFIG_X86_ESPFIX32 -.Lldt_ss: -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. - */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ +#ifdef CONFIG_DEBUG_ENTRY /* - * Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the IRET: + * The stack-frame here is the one that iret faulted on, so its a + * return-to-user frame. We are on kernel-cr3 because we come here from + * the fixup code. This confuses the CR3 checker, so switch to user-cr3 + * as the checker expects it. */ - DISABLE_INTERRUPTS(CLBR_ANY) - lss (%esp), %esp /* switch to espfix segment */ - jmp .Lrestore_nocheck + pushl %eax + SWITCH_TO_USER_CR3 scratch_reg=%eax + popl %eax #endif + + jmp common_exception +.previous + _ASM_EXTABLE(.Lirq_return, iret_exc) ENDPROC(entry_INT80_32) .macro FIXUP_ESPFIX_STACK @@ -671,7 +1103,8 @@ END(irq_entries_start) common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ - SAVE_ALL + + SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax @@ -679,16 +1112,16 @@ common_interrupt: jmp ret_from_intr ENDPROC(common_interrupt) -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL; \ - ENCODE_FRAME_POINTER; \ - TRACE_IRQS_OFF \ - movl %esp, %eax; \ - call fn; \ - jmp ret_from_intr; \ +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL switch_stacks=1; \ + ENCODE_FRAME_POINTER; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ ENDPROC(name) #define BUILD_INTERRUPT(name, nr) \ @@ -920,16 +1353,20 @@ common_exception: pushl %es pushl %ds pushl %eax + movl $(__USER_DS), %eax + movl %eax, %ds + movl %eax, %es + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs pushl %ebp pushl %edi pushl %esi pushl %edx pushl %ecx pushl %ebx + SWITCH_TO_KERNEL_STACK ENCODE_FRAME_POINTER cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs UNWIND_ESPFIX_STACK GS_TO_REG %ecx movl PT_GS(%esp), %edi # get the function address @@ -937,9 +1374,6 @@ common_exception: movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi @@ -948,40 +1382,12 @@ END(common_exception) ENTRY(debug) /* - * #DB can happen at the first instruction of - * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this - * happens, then we will be running on a very small stack. We - * need to detect this condition and switch to the thread - * stack before calling any C code at all. - * - * If you edit this code, keep in mind that NMIs can happen in here. + * Entry from sysenter is now handled in common_exception */ ASM_CLAC pushl $-1 # mark this as an int - SAVE_ALL - ENCODE_FRAME_POINTER - xorl %edx, %edx # error code 0 - movl %esp, %eax # pt_regs pointer - - /* Are we currently on the SYSENTER stack? */ - movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx - subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ - cmpl $SIZEOF_entry_stack, %ecx - jb .Ldebug_from_sysenter_stack - - TRACE_IRQS_OFF - call do_debug - jmp ret_from_exception - -.Ldebug_from_sysenter_stack: - /* We're on the SYSENTER stack. Switch off. */ - movl %esp, %ebx - movl PER_CPU_VAR(cpu_current_top_of_stack), %esp - TRACE_IRQS_OFF - call do_debug - movl %ebx, %esp - jmp ret_from_exception + pushl $do_debug + jmp common_exception END(debug) /* @@ -993,6 +1399,7 @@ END(debug) */ ENTRY(nmi) ASM_CLAC + #ifdef CONFIG_X86_ESPFIX32 pushl %eax movl %ss, %eax @@ -1002,7 +1409,7 @@ ENTRY(nmi) #endif pushl %eax # pt_regs->orig_ax - SAVE_ALL + SAVE_ALL_NMI cr3_reg=%edi ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1016,7 +1423,7 @@ ENTRY(nmi) /* Not on SYSENTER stack. */ call do_nmi - jmp .Lrestore_all_notrace + jmp .Lnmi_return .Lnmi_from_sysenter_stack: /* @@ -1027,7 +1434,11 @@ ENTRY(nmi) movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi movl %ebx, %esp - jmp .Lrestore_all_notrace + +.Lnmi_return: + CHECK_AND_APPLY_ESPFIX + RESTORE_ALL_NMI cr3_reg=%edi pop=4 + jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 .Lnmi_espfix_stack: @@ -1042,12 +1453,12 @@ ENTRY(nmi) pushl 16(%esp) .endr pushl %eax - SAVE_ALL + SAVE_ALL_NMI cr3_reg=%edi ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi - RESTORE_REGS + RESTORE_ALL_NMI cr3_reg=%edi lss 12+4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif @@ -1056,7 +1467,8 @@ END(nmi) ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int - SAVE_ALL + + SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 73a522d53b53..957dfb693ecc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ END(native_usergs_sysret64) .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9, EFLAGS(%rsp) /* interrupts off? */ + btl $9, EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON_DEBUG 1: @@ -408,6 +408,7 @@ ENTRY(ret_from_fork) 1: /* kernel thread */ + UNWIND_HINT_EMPTY movq %r12, %rdi CALL_NOSPEC %rbx /* @@ -701,7 +702,7 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9, EFLAGS(%rsp) /* were interrupts off? */ + btl $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f 0: cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f @@ -981,7 +982,7 @@ ENTRY(\sym) call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit .endif END(\sym) .endm @@ -1222,7 +1223,6 @@ END(paranoid_exit) /* * Save all registers in pt_regs, and switch GS if needed. - * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) UNWIND_HINT_FUNC @@ -1269,7 +1269,6 @@ ENTRY(error_entry) * for these here too. */ .Lerror_kernelspace: - incl %ebx leaq native_irq_return_iret(%rip), %rcx cmpq %rcx, RIP+8(%rsp) je .Lerror_bad_iret @@ -1303,28 +1302,20 @@ ENTRY(error_entry) /* * Pretend that the exception came from user mode: set up pt_regs - * as if we faulted immediately after IRET and clear EBX so that - * error_exit knows that we will be returning to user mode. + * as if we faulted immediately after IRET. */ mov %rsp, %rdi call fixup_bad_iret mov %rax, %rsp - decl %ebx jmp .Lerror_entry_from_usermode_after_swapgs END(error_entry) - -/* - * On entry, EBX is a "return to kernel mode" flag: - * 1: already in kernel mode, don't need SWAPGS - * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode - */ ENTRY(error_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - testl %ebx, %ebx - jnz retint_kernel + testb $3, CS(%rsp) + jz retint_kernel jmp retint_user END(error_exit) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 261802b1cc50..9f695f517747 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -46,10 +46,8 @@ targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ - -Wl,--no-undefined \ - -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ - $(DISABLE_LTO) +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ + -z max-page-size=4096 -z common-page-size=4096 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) @@ -58,9 +56,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(src hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ -define cmd_vdso2c - $(obj)/vdso2c $< $(<:%.dbg=%) $@ -endef + cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE $(call if_changed,vdso2c) @@ -95,10 +91,8 @@ CFLAGS_REMOVE_vvar.o = -pg # CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ - -Wl,-soname=linux-vdso.so.1 \ - -Wl,-z,max-page-size=4096 \ - -Wl,-z,common-page-size=4096 +VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ + -z max-page-size=4096 -z common-page-size=4096 # x32-rebranded versions vobjx32s-y := $(vobjs-y:.o=-x32.o) @@ -123,7 +117,7 @@ $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 +VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o @@ -157,13 +151,13 @@ $(obj)/vdso32.so.dbg: FORCE \ # The DSO images are built using a special linker script. # quiet_cmd_vdso = VDSO $@ - cmd_vdso = $(CC) -nostdlib -o $@ \ + cmd_vdso = $(LD) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ - $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) +VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ + $(call ld-option, --build-id) -Bsymbolic GCOV_PROFILE := n # diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S index 79a071e4357e..79423170118f 100644 --- a/arch/x86/entry/vdso/vdso-note.S +++ b/arch/x86/entry/vdso/vdso-note.S @@ -3,6 +3,7 @@ * Here we can supply some information useful to userland. */ +#include <linux/build-salt.h> #include <linux/uts.h> #include <linux/version.h> #include <linux/elfnote.h> @@ -10,3 +11,5 @@ ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END + +BUILD_SALT diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S index 9fd51f206314..e78047d119f6 100644 --- a/arch/x86/entry/vdso/vdso32/note.S +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -4,6 +4,7 @@ * Here we can supply some information useful to userland. */ +#include <linux/build-salt.h> #include <linux/version.h> #include <linux/elfnote.h> @@ -14,6 +15,8 @@ ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END +BUILD_SALT + #ifdef CONFIG_XEN /* * Add a special note telling glibc's dynamic linker a fake hardware diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4b98101209a1..d50bb4dc0650 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -579,7 +579,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); struct perf_event *event = pcpu->event; - struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event *hwc; struct perf_sample_data data; struct perf_raw_record raw; struct pt_regs regs; @@ -602,6 +602,10 @@ fail: return 0; } + if (WARN_ON_ONCE(!event)) + goto fail; + + hwc = &event->hw; msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 707b2a96e516..035c37481f57 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2041,15 +2041,15 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; } x86_pmu_disable_event(event); - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); } static void intel_pmu_del_event(struct perf_event *event) @@ -2068,17 +2068,19 @@ static void intel_pmu_read_event(struct perf_event *event) x86_perf_event_update(event); } -static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) +static void intel_pmu_enable_fixed(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; + u64 ctrl_val, mask, bits = 0; /* - * Enable IRQ generation (0x8), + * Enable IRQ generation (0x8), if not PEBS, * and enable ring-3 counting (0x2) and ring-0 counting (0x1) * if requested: */ - bits = 0x8ULL; + if (!event->attr.precise_ip) + bits |= 0x8; if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) @@ -2120,14 +2122,14 @@ static void intel_pmu_enable_event(struct perf_event *event) if (unlikely(event_is_checkpointed(event))) cpuc->intel_cp_status |= (1ull << hwc->idx); + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_enable(event); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc); + intel_pmu_enable_fixed(event); return; } - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } @@ -2280,7 +2282,10 @@ again: * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + if (x86_pmu.flags & PMU_FL_PEBS_ALL) + status &= ~cpuc->pebs_enabled; + else + status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); /* * PEBS overflow sets bit 62 in the global status register @@ -2997,6 +3002,9 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { @@ -4069,7 +4077,6 @@ __init int intel_pmu_init(void) intel_pmu_lbr_init_skl(); x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_glp_pebs_event_constraints; x86_pmu.extra_regs = intel_glm_extra_regs; /* * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS @@ -4079,6 +4086,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; x86_pmu.cpu_events = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8a10a045b57b..b7b01d762d32 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -408,9 +408,11 @@ static int alloc_bts_buffer(int cpu) ds->bts_buffer_base = (unsigned long) cea; ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); ds->bts_index = ds->bts_buffer_base; - max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); - ds->bts_absolute_maximum = ds->bts_buffer_base + max; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + (max / 16) * BTS_RECORD_SIZE; return 0; } @@ -711,12 +713,6 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -struct event_constraint intel_glp_pebs_event_constraints[] = { - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ @@ -869,6 +865,13 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) } } + /* + * Extended PEBS support + * Makes the PEBS code search the normal constraints. + */ + if (x86_pmu.flags & PMU_FL_PEBS_ALL) + return NULL; + return &emptyconstraint; } @@ -894,10 +897,16 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; u64 threshold; + int reserved; + + if (x86_pmu.flags & PMU_FL_PEBS_ALL) + reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed; + else + reserved = x86_pmu.max_pebs_events; if (cpuc->n_pebs == cpuc->n_large_pebs) { threshold = ds->pebs_absolute_maximum - - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + reserved * x86_pmu.pebs_record_size; } else { threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; } @@ -961,7 +970,11 @@ void intel_pmu_pebs_enable(struct perf_event *event) * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. */ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - ds->pebs_event_reset[hwc->idx] = + unsigned int idx = hwc->idx; + + if (idx >= INTEL_PMC_IDX_FIXED) + idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); + ds->pebs_event_reset[idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; } else { ds->pebs_event_reset[hwc->idx] = 0; @@ -1184,16 +1197,20 @@ static void setup_pebs_sample_data(struct perf_event *event, } /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happend between the record and + * PMI. + */ + if (sample_type & PERF_SAMPLE_CALLCHAIN) + data->callchain = perf_callchain(event, iregs); + + /* * We use the interrupt regs as a base because the PEBS record does not * contain a full regs set, specifically it seems to lack segment * descriptors, which get used by things like user_mode(). * * In the simple case fix up only the IP for PERF_SAMPLE_IP. - * - * We must however always use BP,SP from iregs for the unwinder to stay - * sane; the record BP,SP can point into thin air when the record is - * from a previous PMI context or an (I)RET happend between the record - * and PMI. */ *regs = *iregs; @@ -1212,15 +1229,8 @@ static void setup_pebs_sample_data(struct perf_event *event, regs->si = pebs->si; regs->di = pebs->di; - /* - * Per the above; only set BP,SP if we don't need callchains. - * - * XXX: does this make sense? - */ - if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) { - regs->bp = pebs->bp; - regs->sp = pebs->sp; - } + regs->bp = pebs->bp; + regs->sp = pebs->sp; #ifndef CONFIG_X86_32 regs->r8 = pebs->r8; @@ -1482,9 +1492,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) struct debug_store *ds = cpuc->ds; struct perf_event *event; void *base, *at, *top; - short counts[MAX_PEBS_EVENTS] = {}; - short error[MAX_PEBS_EVENTS] = {}; - int bit, i; + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + int bit, i, size; + u64 mask; if (!x86_pmu.pebs_active) return; @@ -1494,6 +1505,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) ds->pebs_index = ds->pebs_buffer_base; + mask = (1ULL << x86_pmu.max_pebs_events) - 1; + size = x86_pmu.max_pebs_events; + if (x86_pmu.flags & PMU_FL_PEBS_ALL) { + mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED; + size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + } + if (unlikely(base >= top)) { /* * The drain_pebs() could be called twice in a short period @@ -1503,7 +1521,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * update the event->count for this case. */ for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, - x86_pmu.max_pebs_events) { + size) { event = cpuc->events[bit]; if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_save_and_restart_reload(event, 0); @@ -1516,12 +1534,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) u64 pebs_status; pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + pebs_status &= mask; /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { for_each_set_bit(bit, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) + size) counts[bit]++; continue; @@ -1569,7 +1587,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) counts[bit]++; } - for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + for (bit = 0; bit < size; bit++) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index cf372b90557e..f3e006bed9a7 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -216,6 +216,8 @@ static void intel_pmu_lbr_reset_64(void) void intel_pmu_lbr_reset(void) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (!x86_pmu.lbr_nr) return; @@ -223,6 +225,9 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_32(); else intel_pmu_lbr_reset_64(); + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; } /* @@ -334,6 +339,7 @@ static inline u64 rdlbr_to(unsigned int idx) static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; unsigned lbr_idx, mask; u64 tos; @@ -344,9 +350,21 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) return; } - mask = x86_pmu.lbr_nr - 1; tos = task_ctx->tos; - for (i = 0; i < tos; i++) { + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Did not enter C6 + */ + if ((task_ctx == cpuc->last_task_ctx) && + (task_ctx->log_id == cpuc->last_log_id) && + rdlbr_from(tos)) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + mask = x86_pmu.lbr_nr - 1; + for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); @@ -354,14 +372,24 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + + for (; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + wrlbr_from(lbr_idx, 0); + wrlbr_to(lbr_idx, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); unsigned lbr_idx, mask; - u64 tos; + u64 tos, from; int i; if (task_ctx->lbr_callstack_users == 0) { @@ -371,15 +399,22 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < tos; i++) { + for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - task_ctx->lbr_from[i] = rdlbr_from(lbr_idx); + from = rdlbr_from(lbr_idx); + if (!from) + break; + task_ctx->lbr_from[i] = from; task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->valid_lbrs = i; task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; + + cpuc->last_task_ctx = task_ctx; + cpuc->last_log_id = ++task_ctx->log_id; } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) @@ -531,7 +566,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) */ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { - bool need_info = false; + bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); @@ -542,7 +577,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (cpuc->lbr_sel) { need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; + call_stack = true; } for (i = 0; i < num; i++) { @@ -555,6 +590,13 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) from = rdlbr_from(lbr_idx); to = rdlbr_to(lbr_idx); + /* + * Read LBR call stack entries + * until invalid entry (0s) is detected. + */ + if (call_stack && !from) + break; + if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index c9e1e0bef3c3..e17ab885b1e9 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -28,7 +28,7 @@ #define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) #define UNCORE_PCI_DEV_IDX(data) (data & 0xff) #define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 3 +#define UNCORE_EXTRA_PCI_DEV_MAX 4 #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 87dc0263a2e1..51d7c117e3c7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1029,6 +1029,7 @@ void snbep_uncore_cpu_init(void) enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, + BDX_PCI_QPI_PORT2_FILTER, HSWEP_PCI_PCU_3, }; @@ -3286,15 +3287,18 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { }, { /* QPI Port 0 filter */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), }, { /* QPI Port 1 filter */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), }, { /* QPI Port 2 filter */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + BDX_PCI_QPI_PORT2_FILTER), }, { /* PCU.3 (for Capability registers) */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9f3711470ec1..156286335351 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -163,6 +163,7 @@ struct intel_excl_cntrs { unsigned core_id; /* per-core: core id */ }; +struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 enum { @@ -214,6 +215,8 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; u64 br_sel; + struct x86_perf_task_context *last_task_ctx; + int last_log_id; /* * Intel host/guest exclude bits @@ -648,8 +651,10 @@ struct x86_perf_task_context { u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; int tos; + int valid_lbrs; int lbr_callstack_users; int lbr_stack_state; + int log_id; }; #define x86_add_quirk(func_) \ @@ -668,6 +673,7 @@ do { \ #define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ #define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ #define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ +#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index b173d404e3df..b21ee65c4101 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,2 +1,2 @@ -obj-y := hv_init.o mmu.o +obj-y := hv_init.o mmu.o nested.o obj-$(CONFIG_X86_64) += hv_apic.o diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index f68855499391..5b0f613428c2 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -31,6 +31,8 @@ #include <asm/mshyperv.h> #include <asm/apic.h> +#include <asm/trace/hyperv.h> + static struct apic orig_apic; static u64 hv_apic_icr_read(void) @@ -99,6 +101,9 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) int nr_bank = 0; int ret = 1; + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + local_irq_save(flags); arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); @@ -114,6 +119,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); } + if (nr_bank < 0) + goto ipi_mask_ex_done; if (!nr_bank) ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; @@ -128,10 +135,10 @@ ipi_mask_ex_done: static bool __send_ipi_mask(const struct cpumask *mask, int vector) { int cur_cpu, vcpu; - struct ipi_arg_non_ex **arg; - struct ipi_arg_non_ex *ipi_arg; + struct ipi_arg_non_ex ipi_arg; int ret = 1; - unsigned long flags; + + trace_hyperv_send_ipi_mask(mask, vector); if (cpumask_empty(mask)) return true; @@ -142,37 +149,43 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return false; - if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) - return __send_ipi_mask_ex(mask, vector); - - local_irq_save(flags); - arg = (struct ipi_arg_non_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); - - ipi_arg = *arg; - if (unlikely(!ipi_arg)) - goto ipi_mask_done; - - ipi_arg->vector = vector; - ipi_arg->reserved = 0; - ipi_arg->cpu_mask = 0; + /* + * From the supplied CPU set we need to figure out if we can get away + * with cheaper HVCALL_SEND_IPI hypercall. This is possible when the + * highest VP number in the set is < 64. As VP numbers are usually in + * ascending order and match Linux CPU ids, here is an optimization: + * we check the VP number for the highest bit in the supplied set first + * so we can quickly find out if using HVCALL_SEND_IPI_EX hypercall is + * a must. We will also check all VP numbers when walking the supplied + * CPU set to remain correct in all cases. + */ + if (hv_cpu_number_to_vp_number(cpumask_last(mask)) >= 64) + goto do_ex_hypercall; + + ipi_arg.vector = vector; + ipi_arg.cpu_mask = 0; for_each_cpu(cur_cpu, mask) { vcpu = hv_cpu_number_to_vp_number(cur_cpu); + if (vcpu == VP_INVAL) + return false; + /* * This particular version of the IPI hypercall can * only target upto 64 CPUs. */ if (vcpu >= 64) - goto ipi_mask_done; + goto do_ex_hypercall; - __set_bit(vcpu, (unsigned long *)&ipi_arg->cpu_mask); + __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask); } - ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, NULL); - -ipi_mask_done: - local_irq_restore(flags); + ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, + ipi_arg.cpu_mask); return ((ret == 0) ? true : false); + +do_ex_hypercall: + return __send_ipi_mask_ex(mask, vector); } static bool __send_ipi_one(int cpu, int vector) @@ -228,10 +241,7 @@ static void hv_send_ipi_self(int vector) void __init hv_apic_init(void) { if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) { - if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) - pr_info("Hyper-V: Using ext hypercalls for IPI\n"); - else - pr_info("Hyper-V: Using IPI hypercalls\n"); + pr_info("Hyper-V: Using IPI hypercalls\n"); /* * Set the IPI entry points. */ diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 4c431e1c1eff..20c876c7c5bf 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -265,7 +265,7 @@ void __init hyperv_init(void) { u64 guest_id, required_msrs; union hv_x64_msr_hypercall_contents hypercall_msr; - int cpuhp; + int cpuhp, i; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; @@ -293,6 +293,9 @@ void __init hyperv_init(void) if (!hv_vp_index) return; + for (i = 0; i < num_possible_cpus(); i++) + hv_vp_index[i] = VP_INVAL; + hv_vp_assist_page = kcalloc(num_possible_cpus(), sizeof(*hv_vp_assist_page), GFP_KERNEL); if (!hv_vp_assist_page) { @@ -330,7 +333,7 @@ void __init hyperv_init(void) * Register Hyper-V specific clocksource. */ #ifdef CONFIG_HYPERV_TSCPAGE - if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { + if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) { union hv_x64_msr_hypercall_contents tsc_msr; tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); @@ -359,7 +362,7 @@ register_msr_cs: */ hyperv_cs = &hyperv_cs_msr; - if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); return; @@ -423,6 +426,33 @@ void hyperv_report_panic(struct pt_regs *regs, long err) } EXPORT_SYMBOL_GPL(hyperv_report_panic); +/** + * hyperv_report_panic_msg - report panic message to Hyper-V + * @pa: physical address of the panic page containing the message + * @size: size of the message in the page + */ +void hyperv_report_panic_msg(phys_addr_t pa, size_t size) +{ + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + wrmsrl(HV_X64_MSR_CRASH_P0, 0); + wrmsrl(HV_X64_MSR_CRASH_P1, 0); + wrmsrl(HV_X64_MSR_CRASH_P2, 0); + wrmsrl(HV_X64_MSR_CRASH_P3, pa); + wrmsrl(HV_X64_MSR_CRASH_P4, size); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic_msg); + bool hv_is_hyperv_initialized(void) { union hv_x64_msr_hypercall_contents hypercall_msr; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index de27615c51ea..1147e1fed7ff 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -16,6 +16,8 @@ /* Each gva in gva_list encodes up to 4096 pages to flush */ #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) +static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info); /* * Fills in gva_list starting from offset. Returns the number of items added. @@ -93,10 +95,29 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, if (cpumask_equal(cpus, cpu_present_mask)) { flush->flags |= HV_FLUSH_ALL_PROCESSORS; } else { + /* + * From the supplied CPU set we need to figure out if we can get + * away with cheaper HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE} + * hypercalls. This is possible when the highest VP number in + * the set is < 64. As VP numbers are usually in ascending order + * and match Linux CPU ids, here is an optimization: we check + * the VP number for the highest bit in the supplied set first + * so we can quickly find out if using *_EX hypercalls is a + * must. We will also check all VP numbers when walking the + * supplied CPU set to remain correct in all cases. + */ + if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64) + goto do_ex_hypercall; + for_each_cpu(cpu, cpus) { vcpu = hv_cpu_number_to_vp_number(cpu); - if (vcpu >= 64) + if (vcpu == VP_INVAL) { + local_irq_restore(flags); goto do_native; + } + + if (vcpu >= 64) + goto do_ex_hypercall; __set_bit(vcpu, (unsigned long *) &flush->processor_mask); @@ -123,7 +144,12 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, gva_n, 0, flush, NULL); } + goto check_status; + +do_ex_hypercall: + status = hyperv_flush_tlb_others_ex(cpus, info); +check_status: local_irq_restore(flags); if (!(status & HV_HYPERCALL_RESULT_MASK)) @@ -132,35 +158,22 @@ do_native: native_flush_tlb_others(cpus, info); } -static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, - const struct flush_tlb_info *info) +static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info) { int nr_bank = 0, max_gvas, gva_n; struct hv_tlb_flush_ex **flush_pcpu; struct hv_tlb_flush_ex *flush; - u64 status = U64_MAX; - unsigned long flags; + u64 status; - trace_hyperv_mmu_flush_tlb_others(cpus, info); - - if (!hv_hypercall_pg) - goto do_native; - - if (cpumask_empty(cpus)) - return; - - local_irq_save(flags); + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return U64_MAX; flush_pcpu = (struct hv_tlb_flush_ex **) this_cpu_ptr(hyperv_pcpu_input_arg); flush = *flush_pcpu; - if (unlikely(!flush)) { - local_irq_restore(flags); - goto do_native; - } - if (info->mm) { /* * AddressSpace argument must match the CR3 with PCID bits @@ -176,15 +189,10 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, flush->hv_vp_set.valid_bank_mask = 0; - if (!cpumask_equal(cpus, cpu_present_mask)) { - flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus); - } - - if (!nr_bank) { - flush->hv_vp_set.format = HV_GENERIC_SET_ALL; - flush->flags |= HV_FLUSH_ALL_PROCESSORS; - } + flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus); + if (nr_bank < 0) + return U64_MAX; /* * We can flush not more than max_gvas with one hypercall. Flush the @@ -213,12 +221,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, gva_n, nr_bank, flush, NULL); } - local_irq_restore(flags); - - if (!(status & HV_HYPERCALL_RESULT_MASK)) - return; -do_native: - native_flush_tlb_others(cpus, info); + return status; } void hyperv_setup_mmu_ops(void) @@ -226,11 +229,6 @@ void hyperv_setup_mmu_ops(void) if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) return; - if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { - pr_info("Using hypercall for remote TLB flush\n"); - pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; - } else { - pr_info("Using ext hypercall for remote TLB flush\n"); - pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex; - } + pr_info("Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; } diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c new file mode 100644 index 000000000000..b8e60cc50461 --- /dev/null +++ b/arch/x86/hyperv/nested.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V nested virtualization code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> + */ + + +#include <linux/types.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <asm/tlbflush.h> + +#include <asm/trace/hyperv.h> + +int hyperv_flush_guest_mapping(u64 as) +{ + struct hv_guest_mapping_flush **flush_pcpu; + struct hv_guest_mapping_flush *flush; + u64 status; + unsigned long flags; + int ret = -ENOTSUPP; + + if (!hv_hypercall_pg) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, + flush, NULL); + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + +fault: + trace_hyperv_nested_flush_guest_mapping(as, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 74a9e06b6cfd..130e81e10fc7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -10,6 +10,7 @@ #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> +#include <asm/hardirq.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -502,12 +503,19 @@ extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_SMP +bool apic_id_is_primary_thread(unsigned int id); +#else +static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } +#endif + extern void irq_enter(void); extern void irq_exit(void); static inline void entering_irq(void) { irq_enter(); + kvm_set_cpu_l1tf_flush_l1d(); } static inline void entering_ack_irq(void) @@ -520,6 +528,7 @@ static inline void ipi_entering_ack_irq(void) { irq_enter(); ack_APIC_irq(); + kvm_set_cpu_l1tf_flush_l1d(); } static inline void exiting_irq(void) diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index c356098b6fb9..4d4015ddcf26 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -7,8 +7,6 @@ #ifndef _ASM_X86_MACH_DEFAULT_APM_H #define _ASM_X86_MACH_DEFAULT_APM_H -#include <asm/nospec-branch.h> - #ifdef APM_ZERO_SEGS # define APM_DO_ZERO_SEGS \ "pushl %%ds\n\t" \ @@ -34,7 +32,6 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ - firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -47,7 +44,6 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, "=S" (*esi) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); - firmware_restrict_branch_speculation_end(); } static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, @@ -60,7 +56,6 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ - firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -73,7 +68,6 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, "=S" (si) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); - firmware_restrict_branch_speculation_end(); return error; } diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..990770f9e76b 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -46,6 +46,65 @@ #define _ASM_SI __ASM_REG(si) #define _ASM_DI __ASM_REG(di) +#ifndef __x86_64__ +/* 32 bit */ + +#define _ASM_ARG1 _ASM_AX +#define _ASM_ARG2 _ASM_DX +#define _ASM_ARG3 _ASM_CX + +#define _ASM_ARG1L eax +#define _ASM_ARG2L edx +#define _ASM_ARG3L ecx + +#define _ASM_ARG1W ax +#define _ASM_ARG2W dx +#define _ASM_ARG3W cx + +#define _ASM_ARG1B al +#define _ASM_ARG2B dl +#define _ASM_ARG3B cl + +#else +/* 64 bit */ + +#define _ASM_ARG1 _ASM_DI +#define _ASM_ARG2 _ASM_SI +#define _ASM_ARG3 _ASM_DX +#define _ASM_ARG4 _ASM_CX +#define _ASM_ARG5 r8 +#define _ASM_ARG6 r9 + +#define _ASM_ARG1Q rdi +#define _ASM_ARG2Q rsi +#define _ASM_ARG3Q rdx +#define _ASM_ARG4Q rcx +#define _ASM_ARG5Q r8 +#define _ASM_ARG6Q r9 + +#define _ASM_ARG1L edi +#define _ASM_ARG2L esi +#define _ASM_ARG3L edx +#define _ASM_ARG4L ecx +#define _ASM_ARG5L r8d +#define _ASM_ARG6L r9d + +#define _ASM_ARG1W di +#define _ASM_ARG2W si +#define _ASM_ARG3W dx +#define _ASM_ARG4W cx +#define _ASM_ARG5W r8w +#define _ASM_ARG6W r9w + +#define _ASM_ARG1B dil +#define _ASM_ARG2B sil +#define _ASM_ARG3B dl +#define _ASM_ARG4B cl +#define _ASM_ARG5B r8b +#define _ASM_ARG6B r9b + +#endif + /* * Macros to generate condition code outputs from inline assembly, * The output operand must be type "bool". diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 0db6bec95489..b143717b92b3 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -80,6 +80,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ +#define arch_atomic_sub_and_test arch_atomic_sub_and_test static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); @@ -91,6 +92,7 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ +#define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" @@ -103,6 +105,7 @@ static __always_inline void arch_atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. */ +#define arch_atomic_dec arch_atomic_dec static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" @@ -117,6 +120,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ +#define arch_atomic_dec_and_test arch_atomic_dec_and_test static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); @@ -130,6 +134,7 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ +#define arch_atomic_inc_and_test arch_atomic_inc_and_test static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); @@ -144,6 +149,7 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ +#define arch_atomic_add_negative arch_atomic_add_negative static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); @@ -173,9 +179,6 @@ static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) return arch_atomic_add_return(-i, v); } -#define arch_atomic_inc_return(v) (arch_atomic_add_return(1, v)) -#define arch_atomic_dec_return(v) (arch_atomic_sub_return(1, v)) - static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); @@ -199,7 +202,7 @@ static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int n static inline int arch_atomic_xchg(atomic_t *v, int new) { - return xchg(&v->counter, new); + return arch_xchg(&v->counter, new); } static inline void arch_atomic_and(int i, atomic_t *v) @@ -253,27 +256,6 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) return val; } -/** - * __arch_atomic_add_unless - add unless the number is already a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as @v was not already @u. - * Returns the old value of @v. - */ -static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u) -{ - int c = arch_atomic_read(v); - - do { - if (unlikely(c == u)) - break; - } while (!arch_atomic_try_cmpxchg(v, &c, c + a)); - - return c; -} - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 92212bf0484f..ef959f02d070 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -158,6 +158,7 @@ static inline long long arch_atomic64_inc_return(atomic64_t *v) "S" (v) : "memory", "ecx"); return a; } +#define arch_atomic64_inc_return arch_atomic64_inc_return static inline long long arch_atomic64_dec_return(atomic64_t *v) { @@ -166,6 +167,7 @@ static inline long long arch_atomic64_dec_return(atomic64_t *v) "S" (v) : "memory", "ecx"); return a; } +#define arch_atomic64_dec_return arch_atomic64_dec_return /** * arch_atomic64_add - add integer to atomic64 variable @@ -198,25 +200,12 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v) } /** - * arch_atomic64_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static inline int arch_atomic64_sub_and_test(long long i, atomic64_t *v) -{ - return arch_atomic64_sub_return(i, v) == 0; -} - -/** * arch_atomic64_inc - increment atomic64 variable * @v: pointer to type atomic64_t * * Atomically increments @v by 1. */ +#define arch_atomic64_inc arch_atomic64_inc static inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, @@ -229,6 +218,7 @@ static inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ +#define arch_atomic64_dec arch_atomic64_dec static inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, @@ -236,46 +226,6 @@ static inline void arch_atomic64_dec(atomic64_t *v) } /** - * arch_atomic64_dec_and_test - decrement and test - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int arch_atomic64_dec_and_test(atomic64_t *v) -{ - return arch_atomic64_dec_return(v) == 0; -} - -/** - * atomic64_inc_and_test - increment and test - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int arch_atomic64_inc_and_test(atomic64_t *v) -{ - return arch_atomic64_inc_return(v) == 0; -} - -/** - * arch_atomic64_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int arch_atomic64_add_negative(long long i, atomic64_t *v) -{ - return arch_atomic64_add_return(i, v) < 0; -} - -/** * arch_atomic64_add_unless - add unless the number is a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... @@ -295,7 +245,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, return (int)a; } - +#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero static inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; @@ -304,6 +254,7 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v) return r; } +#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) { long long r; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6106b59d3260..4343d9b4f30e 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -71,6 +71,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ +#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); @@ -82,6 +83,7 @@ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ +#define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" @@ -95,6 +97,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ +#define arch_atomic64_dec arch_atomic64_dec static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" @@ -110,6 +113,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) * returns true if the result is 0, or false for all other * cases. */ +#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); @@ -123,6 +127,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ +#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); @@ -137,6 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ +#define arch_atomic64_add_negative arch_atomic64_add_negative static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); @@ -169,9 +175,6 @@ static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v) return xadd(&v->counter, -i); } -#define arch_atomic64_inc_return(v) (arch_atomic64_add_return(1, (v))) -#define arch_atomic64_dec_return(v) (arch_atomic64_sub_return(1, (v))) - static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new) { return arch_cmpxchg(&v->counter, old, new); @@ -185,46 +188,7 @@ static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, l static inline long arch_atomic64_xchg(atomic64_t *v, long new) { - return xchg(&v->counter, new); -} - -/** - * arch_atomic64_add_unless - add unless the number is a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as it was not @u. - * Returns the old value of @v. - */ -static inline bool arch_atomic64_add_unless(atomic64_t *v, long a, long u) -{ - s64 c = arch_atomic64_read(v); - do { - if (unlikely(c == u)) - return false; - } while (!arch_atomic64_try_cmpxchg(v, &c, c + a)); - return true; -} - -#define arch_atomic64_inc_not_zero(v) arch_atomic64_add_unless((v), 1, 0) - -/* - * arch_atomic64_dec_if_positive - decrement by 1 if old value positive - * @v: pointer of type atomic_t - * - * The function returns the old value of *v minus 1, even if - * the atomic variable, v, was not decremented. - */ -static inline long arch_atomic64_dec_if_positive(atomic64_t *v) -{ - s64 dec, c = arch_atomic64_read(v); - do { - dec = c - 1; - if (unlikely(dec < 0)) - break; - } while (!arch_atomic64_try_cmpxchg(v, &c, dec)); - return dec; + return arch_xchg(&v->counter, new); } static inline void arch_atomic64_and(long i, atomic64_t *v) diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index e3efd8a06066..a55d79b233d3 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -75,7 +75,7 @@ extern void __add_wrong_size(void) * use "asm volatile" and "memory" clobbers to prevent gcc from moving * information around. */ -#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") +#define arch_xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") /* * Atomic compare and exchange. Compare OLD with MEM, if identical, diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index bfca3b346c74..072e5459fe2f 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -10,13 +10,13 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) #define arch_cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - cmpxchg((ptr), (o), (n)); \ + arch_cmpxchg((ptr), (o), (n)); \ }) #define arch_cmpxchg64_local(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - cmpxchg_local((ptr), (o), (n)); \ + arch_cmpxchg_local((ptr), (o), (n)); \ }) #define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5701f5cecd31..89a048c2faec 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,6 +219,8 @@ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -229,7 +231,7 @@ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - +#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -341,6 +343,7 @@ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ @@ -373,5 +376,6 @@ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 0ab2ab27ad1f..b825cb201251 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -4,8 +4,8 @@ #include <linux/compiler.h> #include <linux/init.h> +#include <linux/io.h> -#include <asm/io.h> #include <asm/setup.h> static __always_inline __init void *dmi_alloc(unsigned len) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 740a428acf1e..d9069bb26c7f 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,10 +3,12 @@ #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> -#include <linux/irq.h> typedef struct { - unsigned int __softirq_pending; + u16 __softirq_pending; +#if IS_ENABLED(CONFIG_KVM_INTEL) + u8 kvm_cpu_l1tf_flush_l1d; +#endif unsigned int __nmi_count; /* arch dependent */ #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ @@ -58,4 +60,24 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static inline void kvm_set_cpu_l1tf_flush_l1d(void) +{ + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); +} + +static inline void kvm_clear_cpu_l1tf_flush_l1d(void) +{ + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); +} + +static inline bool kvm_get_cpu_l1tf_flush_l1d(void) +{ + return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); +} +#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ +static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } +#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ + #endif /* _ASM_X86_HARDIRQ_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index f59c39835a5a..a1f0e90d0818 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -49,11 +49,14 @@ static inline int hw_breakpoint_slots(int type) return HBP_NUM; } +struct perf_event_attr; struct perf_event; struct pmu; -extern int arch_check_bp_in_kernelspace(struct perf_event *bp); -extern int arch_validate_hwbkpt_settings(struct perf_event *bp); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); +extern int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index b8c89265baf0..e977b6b3a538 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -35,9 +35,9 @@ /* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ -#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +#define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) /* Partition reference TSC MSR is available */ -#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) +#define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) /* A partition's reference time stamp counter (TSC) page */ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 @@ -60,7 +60,7 @@ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through * HV_X64_MSR_STIMER3_COUNT) available */ -#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +#define HV_MSR_SYNTIMER_AVAILABLE (1 << 3) /* * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) * are available @@ -86,7 +86,7 @@ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) /* stimer Direct Mode is available */ -#define HV_X64_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) +#define HV_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) /* * Feature identification: EBX indicates which flags were specified at @@ -160,9 +160,9 @@ #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) /* - * Virtual APIC support + * Recommend not using Auto End-Of-Interrupt feature */ -#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +#define HV_DEPRECATING_AEOI_RECOMMENDED (1 << 9) /* * Recommend using cluster IPI hypercalls. @@ -176,9 +176,10 @@ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14) /* - * Crash notification flag. + * Crash notification flags. */ -#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) +#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) +#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) /* MSR used to identify the guest OS. */ #define HV_X64_MSR_GUEST_OS_ID 0x40000000 @@ -309,6 +310,7 @@ struct ms_hyperv_tsc_page { #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 /* Nested features (CPUID 0x4000000A) EAX */ +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) struct hv_reenlightenment_control { @@ -350,6 +352,7 @@ struct hv_tsc_emulation_status { #define HVCALL_SEND_IPI_EX 0x0015 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -741,6 +744,12 @@ struct ipi_arg_ex { struct hv_vpset vp_set; }; +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +}; + /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ struct hv_tlb_flush { u64 address_space; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 5cdcdbd4d892..89789e8c80f6 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -3,6 +3,7 @@ #define _ASM_X86_I8259_H #include <linux/delay.h> +#include <asm/io.h> extern unsigned int cached_irq_mask; diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index cf090e584202..7ed08a7c3398 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -76,4 +76,17 @@ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +/* Useful macros */ +#define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \ +{ \ + .vendor = X86_VENDOR_INTEL, \ + .family = _family, \ + .model = _model, \ + .feature = X86_FEATURE_ANY, \ + .driver_data = (kernel_ulong_t)&_driver_data \ +} + +#define INTEL_CPU_FAM6(_model, _driver_data) \ + INTEL_CPU_FAM_ANY(6, INTEL_FAM6_##_model, _driver_data) + #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index fe04491130ae..52f815a80539 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -80,35 +80,6 @@ enum intel_mid_cpu_type { extern enum intel_mid_cpu_type __intel_mid_cpu_chip; -/** - * struct intel_mid_ops - Interface between intel-mid & sub archs - * @arch_setup: arch_setup function to re-initialize platform - * structures (x86_init, x86_platform_init) - * - * This structure can be extended if any new interface is required - * between intel-mid & its sub arch files. - */ -struct intel_mid_ops { - void (*arch_setup)(void); -}; - -/* Helper API's for INTEL_MID_OPS_INIT */ -#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \ - [cpuid] = get_##cpuname##_ops - -/* Maximum number of CPU ops */ -#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *)) - -/* - * For every new cpu addition, a weak get_<cpuname>_ops() function needs be - * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h. - */ -#define INTEL_MID_OPS_INIT { \ - DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ - DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ - DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \ -}; - #ifdef CONFIG_X86_INTEL_MID static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) @@ -136,20 +107,6 @@ enum intel_mid_timer_options { extern enum intel_mid_timer_options intel_mid_timer_options; -/* - * Penwell uses spread spectrum clock, so the freq number is not exactly - * the same as reported by MSR based on SDM. - */ -#define FSB_FREQ_83SKU 83200 -#define FSB_FREQ_100SKU 99840 -#define FSB_FREQ_133SKU 133000 - -#define FSB_FREQ_167SKU 167000 -#define FSB_FREQ_200SKU 200000 -#define FSB_FREQ_267SKU 267000 -#define FSB_FREQ_333SKU 333000 -#define FSB_FREQ_400SKU 400000 - /* Bus Select SoC Fuse value */ #define BSEL_SOC_FUSE_MASK 0x7 /* FSB 133MHz */ diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 62a9f4966b42..ae26df1c2789 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -8,6 +8,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 +#define MAX_FIXED_PEBS_EVENTS 3 /* * A debug store configuration. @@ -23,7 +24,7 @@ struct debug_store { u64 pebs_index; u64 pebs_absolute_maximum; u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; + u64 pebs_event_reset[MAX_PEBS_EVENTS + MAX_FIXED_PEBS_EVENTS]; } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 89f08955fff7..c14f2a74b2be 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -13,7 +13,9 @@ * Interrupt control: */ -static inline unsigned long native_save_fl(void) +/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ +extern inline unsigned long native_save_fl(void); +extern inline unsigned long native_save_fl(void) { unsigned long flags; diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 367d99cff426..c8cec1b39b88 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -78,7 +78,7 @@ struct arch_specific_insn { * boostable = true: This instruction has been boosted: we have * added a relative jump after the instruction copy in insn, * so no single-step and fixup are needed (unless there's - * a post_handler or break_handler). + * a post_handler). */ bool boostable; bool if_modifier; @@ -111,9 +111,6 @@ struct kprobe_ctlblk { unsigned long kprobe_status; unsigned long kprobe_old_flags; unsigned long kprobe_saved_flags; - unsigned long *jprobe_saved_sp; - struct pt_regs jprobe_saved_regs; - kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; struct prev_kprobe prev_kprobe; }; diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h deleted file mode 100644 index 46185263d9c2..000000000000 --- a/arch/x86/include/asm/kvm_guest.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KVM_GUEST_H -#define _ASM_X86_KVM_GUEST_H - -int kvm_setup_vsyscall_timeinfo(void); - -#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c13cd28d9d1b..00ddb0c9e612 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -17,6 +17,7 @@ #include <linux/tracepoint.h> #include <linux/cpumask.h> #include <linux/irq_work.h> +#include <linux/irq.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -54,6 +55,7 @@ #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5) #define KVM_REQ_EVENT KVM_ARCH_REQ(6) #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) @@ -75,13 +77,13 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) +#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -325,6 +327,16 @@ struct rsvd_bits_validate { u64 bad_mt_xwr; }; +struct kvm_mmu_root_info { + gpa_t cr3; + hpa_t hpa; +}; + +#define KVM_MMU_ROOT_INFO_INVALID \ + ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + +#define KVM_MMU_NUM_PREV_ROOTS 3 + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -344,7 +356,7 @@ struct kvm_mmu { struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; @@ -353,6 +365,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault @@ -713,6 +726,9 @@ struct kvm_vcpu_arch { /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; + + /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ + bool l1tf_flush_l1d; }; struct kvm_lpage_info { @@ -881,6 +897,7 @@ struct kvm_vcpu_stat { u64 signal_exits; u64 irq_window_exits; u64 nmi_window_exits; + u64 l1d_flush; u64 halt_exits; u64 halt_successful_poll; u64 halt_attempted_poll; @@ -973,6 +990,15 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + int (*tlb_remote_flush)(struct kvm *kvm); + + /* + * Flush any TLB entries associated with the given GVA. + * Does not need to flush GPA->HPA mappings. + * Can potentially get non-canonical addresses through INVLPGs, which + * the implementation may choose to ignore if appropriate. + */ + void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); @@ -1085,6 +1111,14 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + int (*get_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); @@ -1117,6 +1151,16 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) return kvm_x86_ops->vm_free(kvm); } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + if (kvm_x86_ops->tlb_remote_flush && + !kvm_x86_ops->tlb_remote_flush(kvm)) + return 0; + else + return -ENOTSUPP; +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -1268,6 +1312,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (~0UL) + int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); @@ -1279,7 +1327,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, @@ -1298,7 +1346,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); void kvm_enable_tdp(void); void kvm_disable_tdp(void); @@ -1413,6 +1462,11 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit); + +u64 kvm_get_arch_capabilities(void); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 3aea2658323a..4c723632c036 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,7 +7,6 @@ #include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); -extern int kvm_register_clock(char *txt); #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index bbc796eb0a3b..eeeb9289c764 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -71,12 +71,7 @@ struct ldt_struct { static inline void *ldt_slot_va(int slot) { -#ifdef CONFIG_X86_64 return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); -#else - BUG(); - return (void *)fix_to_virt(FIX_HOLE); -#endif } /* diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 3cd14311edfa..f37704497d8f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -9,6 +9,8 @@ #include <asm/hyperv-tlfs.h> #include <asm/nospec-branch.h> +#define VP_INVAL U32_MAX + struct ms_hyperv_info { u32 features; u32 misc_features; @@ -20,7 +22,6 @@ struct ms_hyperv_info { extern struct ms_hyperv_info ms_hyperv; - /* * Generate the guest ID. */ @@ -75,8 +76,10 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) } } -#define hv_init_timer(timer, tick) wrmsrl(timer, tick) -#define hv_init_timer_config(config, val) wrmsrl(config, val) +#define hv_init_timer(timer, tick) \ + wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick) +#define hv_init_timer_config(timer, val) \ + wrmsrl(HV_X64_MSR_STIMER0_CONFIG + (2*timer), val) #define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val) #define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val) @@ -89,8 +92,13 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index) -#define hv_get_synint_state(int_num, val) rdmsrl(int_num, val) -#define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) +#define hv_get_synint_state(int_num, val) \ + rdmsrl(HV_X64_MSR_SINT0 + int_num, val) +#define hv_set_synint_state(int_num, val) \ + wrmsrl(HV_X64_MSR_SINT0 + int_num, val) + +#define hv_get_crash_ctl(val) \ + rdmsrl(HV_X64_MSR_CRASH_CTL, val) void hyperv_callback_vector(void); void hyperv_reenlightenment_vector(void); @@ -193,6 +201,40 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) return hv_status; } +/* Fast hypercall with 16 bytes of input */ +static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) +{ + u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; + +#ifdef CONFIG_X86_64 + { + __asm__ __volatile__("mov %4, %%r8\n" + CALL_NOSPEC + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + : "r" (input2), + THUNK_TARGET(hv_hypercall_pg) + : "cc", "r8", "r9", "r10", "r11"); + } +#else + { + u32 input1_hi = upper_32_bits(input1); + u32 input1_lo = lower_32_bits(input1); + u32 input2_hi = upper_32_bits(input2); + u32 input2_lo = lower_32_bits(input2); + + __asm__ __volatile__ (CALL_NOSPEC + : "=A"(hv_status), + "+c"(input1_lo), ASM_CALL_CONSTRAINT + : "A" (control), "b" (input1_hi), + "D"(input2_hi), "S"(input2_lo), + THUNK_TARGET(hv_hypercall_pg) + : "cc"); + } +#endif + return hv_status; +} + /* * Rep hypercalls. Callers of this functions are supposed to ensure that * rep_count and varhead_size comply with Hyper-V hypercall definition. @@ -281,6 +323,8 @@ static inline int cpumask_to_vpset(struct hv_vpset *vpset, */ for_each_cpu(cpu, cpus) { vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu == VP_INVAL) + return -1; vcpu_bank = vcpu / 64; vcpu_offset = vcpu % 64; __set_bit(vcpu_offset, (unsigned long *) @@ -295,6 +339,7 @@ static inline int cpumask_to_vpset(struct hv_vpset *vpset, void __init hyperv_init(void); void hyperv_setup_mmu_ops(void); void hyperv_report_panic(struct pt_regs *regs, long err); +void hyperv_report_panic_msg(phys_addr_t pa, size_t size); bool hv_is_hyperv_initialized(void); void hyperv_cleanup(void); @@ -302,6 +347,7 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); +int hyperv_flush_guest_mapping(u64 as); #ifdef CONFIG_X86_64 void hv_apic_init(void); @@ -321,6 +367,7 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) { return NULL; } +static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 68b2c3150de1..4731f0cf97c5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -70,12 +70,19 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO (1 << 4) /* * Not susceptible to Speculative Store Bypass * attack, so no Speculative Store Bypass * control required. */ +#define MSR_IA32_FLUSH_CMD 0x0000010b +#define L1D_FLUSH (1 << 0) /* + * Writeback and invalidate the + * L1 data cache. + */ + #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f6f6c63da62f..fd2a8c1b88bc 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -214,7 +214,7 @@ enum spectre_v2_mitigation { SPECTRE_V2_RETPOLINE_MINIMAL_AMD, SPECTRE_V2_RETPOLINE_GENERIC, SPECTRE_V2_RETPOLINE_AMD, - SPECTRE_V2_IBRS, + SPECTRE_V2_IBRS_ENHANCED, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 9c9dc579bd7d..46f516dd80ce 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -88,6 +88,7 @@ struct orc_entry { unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; + unsigned end:1; } __packed; /* @@ -101,6 +102,7 @@ struct unwind_hint { s16 sp_offset; u8 sp_reg; u8 type; + u8 end; }; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index aa30c3241ea7..0d5c739eebd7 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -29,8 +29,13 @@ #define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE -/* 44=32+12, the limit we can fit into an unsigned long pfn */ -#define __PHYSICAL_MASK_SHIFT 44 +/* + * This is beyond the 44 bit limit imposed by the 32bit long pfns, + * but we need the full mask to make sure inverted PROT_NONE + * entries have all the host bits set in a guest. + * The real limit is still 44 bits. + */ +#define __PHYSICAL_MASK_SHIFT 52 #define __VIRTUAL_MASK_SHIFT 32 #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h index e1084f71a295..94597a3cf3d0 100644 --- a/arch/x86/include/asm/pci-direct.h +++ b/arch/x86/include/asm/pci-direct.h @@ -15,8 +15,4 @@ extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val); extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val); extern int early_pci_allowed(void); - -extern unsigned int pci_early_dump_regs; -extern void early_dump_pci_device(u8 bus, u8 slot, u8 func); -extern void early_dump_pci_devices(void); #endif /* _ASM_X86_PCI_DIRECT_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index a06b07399d17..e9202a0de8f0 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -450,9 +450,10 @@ do { \ bool __ret; \ typeof(pcp1) __o1 = (o1), __n1 = (n1); \ typeof(pcp2) __o2 = (o2), __n2 = (n2); \ - asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ - : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \ - : "b" (__n1), "c" (__n2), "a" (__o1)); \ + asm volatile("cmpxchg8b "__percpu_arg(1) \ + CC_SET(z) \ + : CC_OUT(z) (__ret), "+m" (pcp1), "+m" (pcp2), "+a" (__o1), "+d" (__o2) \ + : "b" (__n1), "c" (__n2)); \ __ret; \ }) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 685ffe8a0eaf..24c6cf5f16b7 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -19,6 +19,9 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd); +#endif *pmdp = pmd; } @@ -58,6 +61,9 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0)); +#endif return __pmd(xchg((pmdval_t *)xp, 0)); } #else @@ -67,6 +73,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #ifdef CONFIG_SMP static inline pud_t native_pudp_get_and_clear(pud_t *xp) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0)); +#endif return __pud(xchg((pudval_t *)xp, 0)); } #else @@ -95,4 +104,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* No inverted PFNs on 2 level page tables */ + +static inline u64 protnone_mask(u64 val) +{ + return 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + return val; +} + +static inline bool __pte_needs_invert(u64 val) +{ + return false; +} + #endif /* _ASM_X86_PGTABLE_2LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index f982ef808e7e..6deb6cd236e3 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -35,4 +35,7 @@ typedef union { #define PTRS_PER_PTE 1024 +/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */ +#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT) + #endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f24df59c40b2..a564084c6141 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -98,6 +98,9 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void native_set_pud(pud_t *pudp, pud_t pud) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); +#endif set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); } @@ -229,6 +232,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) { union split_pud res, *orig = (union split_pud *)pudp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0)); +#endif + /* xchg acts as a barrier before setting of the high bits */ res.pud_low = xchg(&orig->pud_low, 0); res.pud_high = orig->pud_high; @@ -241,12 +248,43 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #endif /* Encode and de-code a swap entry */ +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) + #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) -#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) -#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +/* + * Normally, __swp_entry() converts from arch-independent swp_entry_t to + * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result + * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the + * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to + * __swp_entry_to_pte() through the following helper macro based on 64bit + * __swp_entry(). + */ +#define __swp_pteval_entry(type, offset) ((pteval_t) { \ + (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) }) + +#define __swp_entry_to_pte(x) ((pte_t){ .pte = \ + __swp_pteval_entry(__swp_type(x), __swp_offset(x)) }) +/* + * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent + * swp_entry_t, but also has to convert it from 64bit to the 32bit + * intermediate representation, using the following macros based on 64bit + * __swp_type() and __swp_offset(). + */ +#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS))) +#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)) + +#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ + __pteval_swp_offset(pte))) #define gup_get_pte gup_get_pte /* @@ -295,4 +333,6 @@ static inline pte_t gup_get_pte(pte_t *ptep) return pte; } +#include <asm/pgtable-invert.h> + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 6a59a6d0cc50..858358a82b14 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -21,9 +21,10 @@ typedef union { #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_PARAVIRT -#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) +#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \ + (pv_info.shared_kernel_pmd))) #else -#define SHARED_KERNEL_PMD 1 +#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif /* @@ -45,5 +46,6 @@ typedef union { #define PTRS_PER_PTE 512 #define MAX_POSSIBLE_PHYSMEM_BITS 36 +#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT) #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h new file mode 100644 index 000000000000..a0c1525f1b6f --- /dev/null +++ b/arch/x86/include/asm/pgtable-invert.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PGTABLE_INVERT_H +#define _ASM_PGTABLE_INVERT_H 1 + +#ifndef __ASSEMBLY__ + +/* + * A clear pte value is special, and doesn't get inverted. + * + * Note that even users that only pass a pgprot_t (rather + * than a full pte) won't trigger the special zero case, + * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED + * set. So the all zero case really is limited to just the + * cleared page table entry case. + */ +static inline bool __pte_needs_invert(u64 val) +{ + return val && !(val & _PAGE_PRESENT); +} + +/* Get a mask to xor with the page table entry to get the correct pfn. */ +static inline u64 protnone_mask(u64 val) +{ + return __pte_needs_invert(val) ? ~0ull : 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + /* + * When a PTE transitions from NONE to !NONE or vice-versa + * invert the PFN part to stop speculation. + * pte_pfn undoes this when needed. + */ + if (__pte_needs_invert(oldval) != __pte_needs_invert(val)) + val = (val & ~mask) | (~val & mask); + return val; +} + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5715647fc4fe..e4ffa565a69f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -30,11 +30,14 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); void ptdump_walk_pgd_level_checkwx(void); +void ptdump_walk_user_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) +#define debug_checkwx() do { } while (0) +#define debug_checkwx_user() do { } while (0) #endif /* @@ -185,19 +188,29 @@ static inline int pte_special(pte_t pte) return pte_flags(pte) & _PAGE_SPECIAL; } +/* Entries that were set to PROT_NONE are inverted */ + +static inline u64 protnone_mask(u64 val); + static inline unsigned long pte_pfn(pte_t pte) { - return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; + phys_addr_t pfn = pte_val(pte); + pfn ^= protnone_mask(pfn); + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; + phys_addr_t pfn = pmd_val(pmd); + pfn ^= protnone_mask(pfn); + return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; + phys_addr_t pfn = pud_val(pud); + pfn ^= protnone_mask(pfn); + return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } static inline unsigned long p4d_pfn(p4d_t p4d) @@ -400,11 +413,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); -} - static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); @@ -459,11 +467,6 @@ static inline pud_t pud_mkwrite(pud_t pud) return pud_set_flags(pud, _PAGE_RW); } -static inline pud_t pud_mknotpresent(pud_t pud) -{ - return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE); -} - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -545,25 +548,45 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PTE_PFN_MASK; + return __pte(pfn | check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PMD_PAGE_MASK; + return __pmd(pfn | check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { - return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PUD_PAGE_MASK; + return __pud(pfn | check_pgprot(pgprot)); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pfn_pmd(pmd_pfn(pmd), + __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pteval_t val = pte_val(pte); + pteval_t val = pte_val(pte), oldval = val; /* * Chop off the NX bit (if present), and add the NX portion of @@ -571,17 +594,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); return __pte(val); } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmdval_t val = pmd_val(pmd); + pmdval_t val = pmd_val(pmd), oldval = val; val &= _HPAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); return __pmd(val); } @@ -640,8 +663,31 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgtbl(pgdp, pgd); +} +#else /* CONFIG_PAGE_TABLE_ISOLATION */ +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + #endif /* __ASSEMBLY__ */ + #ifdef CONFIG_X86_32 # include <asm/pgtable_32.h> #else @@ -1154,6 +1200,70 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } } #endif +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. + */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); +} + +static inline int pgd_large(pgd_t pgd) { return 0; } + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). + */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr |= BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr &= ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); @@ -1320,6 +1430,14 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return __pte_access_permitted(pud_val(pud), write); } +#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1 +extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot); + +static inline bool arch_has_pfn_modify_check(void) +{ + return boot_cpu_has_bug(X86_BUG_L1TF); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 88a056b01db4..b3ec519e3982 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -34,8 +34,6 @@ static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); -static inline int pgd_large(pgd_t pgd) { return 0; } - /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index d9a001a4a872..b0bc0fff5f1f 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -50,13 +50,18 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ & PMD_MASK) -#define PKMAP_BASE \ +#define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) +#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE) + +#define PKMAP_BASE \ + ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK) + #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) #else -# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) +# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE) #endif #define MODULES_VADDR VMALLOC_START diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 3c5385f9a88f..f773d5e6c8cc 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -132,90 +132,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } -#ifdef CONFIG_PAGE_TABLE_ISOLATION -/* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages - * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and - * the user one is in the last 4k. To switch between them, you - * just need to flip the 12th bit in their addresses. - */ -#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT - -/* - * This generates better code than the inline assembly in - * __set_bit(). - */ -static inline void *ptr_set_bit(void *ptr, int bit) -{ - unsigned long __ptr = (unsigned long)ptr; - - __ptr |= BIT(bit); - return (void *)__ptr; -} -static inline void *ptr_clear_bit(void *ptr, int bit) -{ - unsigned long __ptr = (unsigned long)ptr; - - __ptr &= ~BIT(bit); - return (void *)__ptr; -} - -static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) -{ - return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) -{ - return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) -{ - return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) -{ - return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -} -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ - -/* - * Page table pages are page-aligned. The lower half of the top - * level is used for userspace and the top half for the kernel. - * - * Returns true for parts of the PGD that map userspace and - * false for the parts that map the kernel. - */ -static inline bool pgdp_maps_userspace(void *__ptr) -{ - unsigned long ptr = (unsigned long)__ptr; - - return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); -} - -#ifdef CONFIG_PAGE_TABLE_ISOLATION -pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); - -/* - * Take a PGD location (pgdp) and a pgd value that needs to be set there. - * Populates the user and returns the resulting PGD that must be set in - * the kernel copy of the page tables. - */ -static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) -{ - if (!static_cpu_has(X86_FEATURE_PTI)) - return pgd; - return __pti_set_user_pgd(pgdp, pgd); -} -#else -static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) -{ - return pgd; -} -#endif - static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; @@ -226,7 +142,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) } pgd = native_make_pgd(native_p4d_val(p4d)); - pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); + pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); *p4dp = native_make_p4d(native_pgd_val(pgd)); } @@ -237,7 +153,7 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pti_set_user_pgd(pgdp, pgd); + *pgdp = pti_set_user_pgtbl(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) @@ -255,7 +171,6 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); /* * Level 4 access. */ -static inline int pgd_large(pgd_t pgd) { return 0; } #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) /* PUD - Level3 access */ @@ -273,7 +188,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -286,20 +201,34 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. + * + * The offset is inverted by a binary not operation to make the high + * physical bits set. */ -#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) -#define SWP_TYPE_BITS 5 -/* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ - & ((1U << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (SWP_TYPE_FIRST_BIT)) \ - | ((offset) << SWP_OFFSET_FIRST_BIT) }) +/* Extract the high bits for type */ +#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) + +/* Shift up (to get rid of type), then down to get value */ +#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) + +/* + * Shift the offset up "too far" by TYPE bits, then down again + * The offset is inverted by a binary not operation to make the high + * physical bits set. + */ +#define __swp_entry(type, offset) ((swp_entry_t) { \ + (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) + #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) @@ -343,5 +272,7 @@ static inline bool gup_fast_permitted(unsigned long start, int nr_pages, return true; } +#include <asm/pgtable-invert.h> + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 054765ab2da2..04edd2d58211 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -115,6 +115,7 @@ extern unsigned int ptrs_per_p4d; #define LDT_PGD_ENTRY_L5 -112UL #define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) +#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE) #define __VMALLOC_BASE_L4 0xffffc90000000000UL #define __VMALLOC_BASE_L5 0xffa0000000000000UL @@ -153,4 +154,6 @@ extern unsigned int ptrs_per_p4d; #define EARLY_DYNAMIC_PAGE_TABLES 64 +#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 99fff853c944..b64acb08a62b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -50,6 +50,7 @@ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) +#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) @@ -266,14 +267,37 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +#ifdef CONFIG_X86_PAE + +/* + * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't + * use it here. + */ + +#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK) +#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK) + +/* + * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries. + * All other bits are Reserved MBZ + */ +#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \ + _PAGE_PWT | _PAGE_PCD | \ + _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3) + +#else +/* No need to mask any bits for !PAE */ +#define PGD_ALLOWED_BITS (~0ULL) +#endif + static inline pgd_t native_make_pgd(pgdval_t val) { - return (pgd_t) { val }; + return (pgd_t) { val & PGD_ALLOWED_BITS }; } static inline pgdval_t native_pgd_val(pgd_t pgd) { - return pgd.pgd; + return pgd.pgd & PGD_ALLOWED_BITS; } static inline pgdval_t pgd_flags(pgd_t pgd) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 625a52a5594f..02c2cbda4a74 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -39,10 +39,6 @@ #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) -#ifdef CONFIG_PAGE_TABLE_ISOLATION -# define X86_CR3_PTI_PCID_USER_BIT 11 -#endif - #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save @@ -53,4 +49,8 @@ #define CR3_NOFLUSH 0 #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_PCID_USER_BIT 11 +#endif + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cfd29ee8c3da..682286aca881 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -181,6 +181,11 @@ extern const struct seq_operations cpuinfo_op; extern void cpu_detect(struct cpuinfo_x86 *c); +static inline unsigned long l1tf_pfn_limit(void) +{ + return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; +} + extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); @@ -966,6 +971,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) extern unsigned long arch_align_stack(unsigned long sp); extern void free_init_pages(char *what, unsigned long begin, unsigned long end); +extern void free_kernel_image_pages(void *begin, void *end); void default_idle(void); #ifdef CONFIG_XEN @@ -977,4 +983,16 @@ bool xen_set_default_idle(void); void stop_this_cpu(void *dummy); void df_debug(struct pt_regs *regs, long error_code); void microcode_check(void); + +enum l1tf_mitigations { + L1TF_MITIGATION_OFF, + L1TF_MITIGATION_FLUSH_NOWARN, + L1TF_MITIGATION_FLUSH, + L1TF_MITIGATION_FLUSH_NOSMT, + L1TF_MITIGATION_FULL, + L1TF_MITIGATION_FULL_FORCE +}; + +extern enum l1tf_mitigations l1tf_mitigation; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 38a17f1d5c9d..5df09a0b80b8 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -6,10 +6,9 @@ #ifdef CONFIG_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); -extern void pti_clone_kernel_text(void); +extern void pti_finalize(void); #else static inline void pti_check_boottime_disable(void) { } -static inline void pti_clone_kernel_text(void) { } #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 9ef5ee03d2d7..159622ee0674 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -43,7 +43,7 @@ asm (".pushsection .text;" "push %rdx;" "mov $0x1,%eax;" "xor %edx,%edx;" - "lock cmpxchg %dl,(%rdi);" + LOCK_PREFIX "cmpxchg %dl,(%rdi);" "cmp $0x1,%al;" "jne .slowpath;" "pop %rdx;" diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index 4cf11d88d3b3..19b90521954c 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -5,6 +5,7 @@ * PaX/grsecurity. */ #include <linux/refcount.h> +#include <asm/bug.h> /* * This is the first portion of the refcount error handling, which lives in diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 5c019d23d06b..4a911a382ade 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,6 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; +extern char __end_rodata_aligned[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index bd090367236c..34cffcef7375 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); +int set_memory_np_noalias(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index eb5f7999a893..36bd243843d6 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -87,15 +87,25 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) #endif /* This is used when switching tasks or entering/exiting vm86 mode. */ -static inline void update_sp0(struct task_struct *task) +static inline void update_task_stack(struct task_struct *task) { - /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ + /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 - load_sp0(task->thread.sp0); + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task->thread.sp0); + else + this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else + /* + * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That + * doesn't work on x86-32 because sp1 and + * cpu_current_top_of_stack have different values (because of + * the non-zero stack-padding on 32bit). + */ if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif + } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 2ecd34e2d46c..e85ff65c43c3 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -37,5 +37,6 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern int after_bootmem; #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6690cd3fc8b1..511bf5fae8b8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif -static inline bool tlb_defer_switch_to_init_mm(void) -{ - /* - * If we have PCID, then switching to init_mm is reasonably - * fast. If we don't have PCID, then switching to init_mm is - * quite slow, so we try to defer it in the hopes that we can - * avoid it entirely. The latter approach runs the risk of - * receiving otherwise unnecessary IPIs. - * - * This choice is just a heuristic. The tlb code can handle this - * function returning true or false regardless of whether we have - * PCID. - */ - return !static_cpu_has(X86_FEATURE_PCID); -} - struct tlb_context { u64 ctx_id; u64 tlb_gen; @@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); native_flush_tlb_others(mask, info) #endif +extern void tlb_flush_remove_tables(struct mm_struct *mm); +extern void tlb_flush_remove_tables_local(void *arg); + +#define HAVE_TLB_FLUSH_REMOVE_TABLES + #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c1d2a9892352..453cf38a1c33 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -123,13 +123,17 @@ static inline int topology_max_smt_threads(void) } int topology_update_package_map(unsigned int apicid, unsigned int cpu); -extern int topology_phys_to_logical_pkg(unsigned int pkg); +int topology_phys_to_logical_pkg(unsigned int pkg); +bool topology_is_primary_thread(unsigned int cpu); +bool topology_smt_supported(void); #else #define topology_max_packages() (1) static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } +static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline bool topology_smt_supported(void) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index 4253bca99989..2e6245a023ef 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@ -28,6 +28,35 @@ TRACE_EVENT(hyperv_mmu_flush_tlb_others, __entry->addr, __entry->end) ); +TRACE_EVENT(hyperv_nested_flush_guest_mapping, + TP_PROTO(u64 as, int ret), + TP_ARGS(as, ret), + + TP_STRUCT__entry( + __field(u64, as) + __field(int, ret) + ), + TP_fast_assign(__entry->as = as; + __entry->ret = ret; + ), + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) + ); + +TRACE_EVENT(hyperv_send_ipi_mask, + TP_PROTO(const struct cpumask *cpus, + int vector), + TP_ARGS(cpus, vector), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(int, vector) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->vector = vector; + ), + TP_printk("ncpus %d vector %x", + __entry->ncpus, __entry->vector) + ); + #endif /* CONFIG_HYPERV */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 2701d221583a..eb5bbfeccb66 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -33,13 +33,13 @@ static inline cycles_t get_cycles(void) extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); -extern void tsc_early_delay_calibrate(void); +extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); -extern unsigned long native_calibrate_cpu(void); +extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 62acb613114b..a9d637bc301d 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -52,7 +52,12 @@ copy_to_user_mcsafe(void *to, const void *from, unsigned len) unsigned long ret; __uaccess_begin(); - ret = memcpy_mcsafe(to, from, len); + /* + * Note, __memcpy_mcsafe() is explicitly used since it can + * handle exceptions / faults. memcpy_mcsafe() may fall back to + * memcpy() which lacks this handling. + */ + ret = __memcpy_mcsafe(to, from, len); __uaccess_end(); return ret; } diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index bae46fc6b9de..0bcdb1279361 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -26,7 +26,7 @@ * the debuginfo as necessary. It will also warn if it sees any * inconsistencies. */ -.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL end=0 #ifdef CONFIG_STACK_VALIDATION .Lunwind_hint_ip_\@: .pushsection .discard.unwind_hints @@ -35,12 +35,14 @@ .short \sp_offset .byte \sp_reg .byte \type + .byte \end + .balign 4 .popsection #endif .endm .macro UNWIND_HINT_EMPTY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED end=1 .endm .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 @@ -86,19 +88,21 @@ #else /* !__ASSEMBLY__ */ -#define UNWIND_HINT(sp_reg, sp_offset, type) \ +#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ "987: \n\t" \ ".pushsection .discard.unwind_hints\n\t" \ /* struct unwind_hint */ \ ".long 987b - .\n\t" \ - ".short " __stringify(sp_offset) "\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ ".byte " __stringify(sp_reg) "\n\t" \ ".byte " __stringify(type) "\n\t" \ + ".byte " __stringify(end) "\n\t" \ + ".balign 4 \n\t" \ ".popsection\n\t" -#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0) -#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0) #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6aa8499e1f62..95f9107449bf 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -576,4 +576,15 @@ enum vm_instruction_error_number { VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, }; +enum vmx_l1d_flush_state { + VMENTER_L1D_FLUSH_AUTO, + VMENTER_L1D_FLUSH_NEVER, + VMENTER_L1D_FLUSH_COND, + VMENTER_L1D_FLUSH_ALWAYS, + VMENTER_L1D_FLUSH_EPT_DISABLED, + VMENTER_L1D_FLUSH_NOT_REQUIRED, +}; + +extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; + #endif diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index bfd882617613..6b2f90a0b149 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -209,24 +209,37 @@ extern struct { char _entry[32]; } hypercall_page[]; }) static inline long -privcmd_call(unsigned call, - unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4, - unsigned long a5) +xen_single_call(unsigned int call, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, + unsigned long a5) { __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); - stac(); asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); - clac(); return (long)__res; } +static inline long +privcmd_call(unsigned int call, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, + unsigned long a5) +{ + long res; + + stac(); + res = xen_single_call(call, a1, a2, a3, a4, a5); + clac(); + + return res; +} + static inline int HYPERVISOR_set_trap_table(struct trap_info *table) { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c535c2fdea13..86299efa804a 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,4 +378,41 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +/* for KVM_CAP_NESTED_STATE */ +struct kvm_nested_state { + /* KVM_STATE_* flags */ + __u16 flags; + + /* 0 for VMX, 1 for SVM. */ + __u16 format; + + /* 128 for SVM, 128 + VMCS size for VMX. */ + __u32 size; + + union { + /* VMXON, VMCS */ + struct kvm_vmx_nested_state vmx; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + }; + + __u8 data[0]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 0ede697c3961..19980ec1a316 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -28,6 +28,7 @@ #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 +#define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 02d6f5cf4e70..8824d01c0c35 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -61,6 +61,7 @@ obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-y += irqflags.o obj-y += process.o obj-y += fpu/ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a481763a3776..014f214da581 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -668,6 +668,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, local_irq_save(flags); memcpy(addr, opcode, len); local_irq_restore(flags); + sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; @@ -693,6 +694,12 @@ void *text_poke(void *addr, const void *opcode, size_t len) struct page *pages[2]; int i; + /* + * While boot memory allocator is runnig we cannot use struct + * pages as they are not yet initialized. + */ + BUG_ON(!after_bootmem); + if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2aabd4cb0e3f..84132eddb5a8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -56,6 +56,7 @@ #include <asm/hypervisor.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/irq_regs.h> unsigned int num_processors; @@ -573,6 +574,9 @@ static u32 skx_deadline_rev(void) case 0x04: return 0x02000014; } + if (boot_cpu_data.x86_stepping > 4) + return 0; + return ~0U; } @@ -937,7 +941,7 @@ static int __init calibrate_APIC_clock(void) if (levt->features & CLOCK_EVT_FEAT_DUMMY) { pr_warning("APIC timer disabled due to verification failure\n"); - return -1; + return -1; } return 0; @@ -2189,6 +2193,23 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +#ifdef CONFIG_SMP +/** + * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread + * @id: APIC ID to check + */ +bool apic_id_is_primary_thread(unsigned int apicid) +{ + u32 mask; + + if (smp_num_siblings == 1) + return true; + /* Isolate the SMT bit(s) in the APICID and check for 0 */ + mask = (1U << (fls(smp_num_siblings) - 1)) - 1; + return !(apicid & mask); +} +#endif + /* * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids * and cpuid_to_apicid[] synchronized. diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3982f79d2377..ff0d14cd9e82 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -33,6 +33,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ce503c99f5c4..72a94401f9e0 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -12,6 +12,7 @@ */ #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/pci.h> #include <linux/dmar.h> #include <linux/hpet.h> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 35aaee4fc028..9f148e3d45b4 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -11,6 +11,7 @@ * published by the Free Software Foundation. */ #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compiler.h> @@ -218,7 +219,8 @@ static int reserve_irq_vector(struct irq_data *irqd) return 0; } -static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) +static int +assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) { struct apic_chip_data *apicd = apic_chip_data(irqd); bool resvd = apicd->has_reserved; @@ -245,22 +247,12 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) return -EBUSY; vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); - if (vector > 0) - apic_update_vector(irqd, vector, cpu); trace_vector_alloc(irqd->irq, vector, resvd, vector); - return vector; -} - -static int assign_vector_locked(struct irq_data *irqd, - const struct cpumask *dest) -{ - struct apic_chip_data *apicd = apic_chip_data(irqd); - int vector = allocate_vector(irqd, dest); - if (vector < 0) return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); - apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); return 0; } @@ -433,7 +425,7 @@ static int activate_managed(struct irq_data *irqd) pr_err("Managed startup irq %u, no vector available\n", irqd->irq); } - return ret; + return ret; } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d492752f79e1..391f358ebb4c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -394,10 +394,10 @@ extern int uv_hub_info_version(void) EXPORT_SYMBOL(uv_hub_info_version); /* Default UV memory block size is 2GB */ -static unsigned long mem_block_size = (2UL << 30); +static unsigned long mem_block_size __initdata = (2UL << 30); /* Kernel parameter to specify UV mem block size */ -static int parse_mem_block_size(char *ptr) +static int __init parse_mem_block_size(char *ptr) { unsigned long size = memparse(ptr, NULL); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d0de79fdab0..ec00d1ff5098 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -240,6 +240,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/nospec-branch.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -614,11 +615,13 @@ static long __apm_bios_call(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; apm_bios_call_asm(call->func, call->ebx, call->ecx, &call->eax, &call->ebx, &call->ecx, &call->edx, &call->esi); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); @@ -690,10 +693,12 @@ static long __apm_bios_call_simple(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, &call->eax); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index dcb008c320fe..01de31db300d 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -103,4 +103,9 @@ void common(void) { OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); + + /* Offset for sp0 and sp1 into the tss_struct */ + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a4a3be399f4b..82826f2275cc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -46,8 +46,14 @@ void foo(void) OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); - /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + /* + * Offset from the entry stack to task stack stored in TSS. Kernel entry + * happens on the per-cpu entry-stack, and the asm code switches to the + * task-stack pointer stored in x86_tss.sp1, which is a copy of + * task->thread.sp0 where entry code can find it. + */ + DEFINE(TSS_entry2task_stack, + offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index b2dcd161f514..3b9405e7ba2b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -65,8 +65,6 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); - OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); - OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7a40196967cb..347137e80bf5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -35,7 +35,9 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o +CFLAGS_intel_rdt_pseudo_lock.o = -I$(src) obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 082d7875cef8..22ab408177b2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -232,8 +232,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c) } } - set_cpu_cap(c, X86_FEATURE_K7); - /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -315,6 +313,13 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) c->cpu_core_id %= cus_per_node; } + +static void amd_get_topology_early(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; +} + /* * Fixup core topology information for * (1) AMD multi-node processors @@ -334,7 +339,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); node_id = ecx & 0xff; - smp_num_siblings = ((ebx >> 8) & 0xff) + 1; if (c->x86 == 0x15) c->cu_id = ebx & 0xff; @@ -543,7 +547,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) nodes_per_socket = ((value >> 3) & 7) + 1; } - if (c->x86 >= 0x15 && c->x86 <= 0x17) { + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && + c->x86 >= 0x15 && c->x86 <= 0x17) { unsigned int bit; switch (c->x86) { @@ -611,10 +617,19 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { + u64 value; u32 dummy; early_init_amd_mc(c); +#ifdef CONFIG_X86_32 + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_K7); +#endif + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* @@ -681,6 +696,22 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_E400); early_detect_mem_encrypt(c); + + /* Re-enable TopologyExtensions if switched off by BIOS */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + + if (msr_set_bit(0xc0011005, 54) > 0) { + rdmsrl(0xc0011005, value); + if (value & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } + } + } + + amd_get_topology_early(c); } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -772,19 +803,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; - /* re-enable TopologyExtensions if switched off by BIOS */ - if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - /* * The way access filter has a performance penalty on some workloads. * Disable it on the affected CPUs. @@ -848,22 +866,12 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) { - amd_detect_cmp(c); - amd_get_topology(c); - srat_detect_node(c); - } - -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif + amd_detect_cmp(c); + amd_get_topology(c); + srat_detect_node(c); init_amd_cacheinfo(c); - if (c->x86 >= 0xf) - set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has(c, X86_FEATURE_XMM2)) { unsigned long long val; int ret; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 404df26b7de8..cb4a16292aa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,15 +22,18 @@ #include <asm/processor-flags.h> #include <asm/fpu/internal.h> #include <asm/msr.h> +#include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/alternative.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> +#include <asm/e820/api.h> #include <asm/hypervisor.h> static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init l1tf_select_mitigation(void); /* * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any @@ -56,6 +59,12 @@ void __init check_bugs(void) { identify_boot_cpu(); + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology_early(); + if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); print_cpu_info(&boot_cpu_data); @@ -82,6 +91,8 @@ void __init check_bugs(void) */ ssb_select_mitigation(); + l1tf_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -130,6 +141,7 @@ static const char *spectre_v2_strings[] = { [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", }; #undef pr_fmt @@ -155,7 +167,8 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) hostval |= ssbd_tif_to_spec_ctrl(ti->flags); if (hostval != guestval) { @@ -312,23 +325,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } -/* Check for Skylake-like CPUs (for RSB handling) */ -static bool __init is_skylake_era(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - return true; - } - } - return false; -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -348,6 +344,13 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + mode = SPECTRE_V2_IBRS_ENHANCED; + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + goto specv2_set_mode; + } if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; @@ -385,26 +388,20 @@ retpoline_auto: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } +specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If neither SMEP nor PTI are available, there is a risk of - * hitting userspace addresses in the RSB after a context switch - * from a shallow call stack to a deeper one. To prevent this fill - * the entire RSB, even when using IBRS. + * If spectre v2 protection has been enabled, unconditionally fill + * RSB during a context switch; this protects against two independent + * issues: * - * Skylake era CPUs have a separate issue with *underflow* of the - * RSB, when they will predict 'ret' targets from the generic BTB. - * The proper mitigation for this is IBRS. If IBRS is not supported - * or deactivated in favour of retpolines the RSB fill on context - * switch is required. + * - RSB underflow (and switch to BTB) on Skylake+ + * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs */ - if ((!boot_cpu_has(X86_FEATURE_PTI) && - !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); - } + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); /* Initialize Indirect Branch Prediction Barrier if supported */ if (boot_cpu_has(X86_FEATURE_IBPB)) { @@ -414,9 +411,16 @@ retpoline_auto: /* * Retpoline means the kernel is safe because it has no indirect - * branches. But firmware isn't, so use IBRS to protect that. + * branches. Enhanced IBRS protects firmware too, so, enable restricted + * speculation around firmware calls only when Enhanced IBRS isn't + * supported. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS)) { + if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } @@ -533,9 +537,10 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may * use a completely different MSR and bit dependent on family. */ - if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !static_cpu_has(X86_FEATURE_AMD_SSBD)) { x86_amd_ssb_disable(); - else { + } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); @@ -652,8 +657,120 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +#undef pr_fmt +#define pr_fmt(fmt) "L1TF: " fmt + +/* Default mitigation for L1TF-affected CPUs */ +enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +#if IS_ENABLED(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(l1tf_mitigation); +#endif +enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); + +static void __init l1tf_select_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + break; + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + cpu_smt_disable(false); + break; + case L1TF_MITIGATION_FULL_FORCE: + cpu_smt_disable(true); + break; + } + +#if CONFIG_PGTABLE_LEVELS == 2 + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); + return; +#endif + + /* + * This is extremely unlikely to happen because almost all + * systems have far more MAX_PA/2 than RAM can be fit into + * DIMM slots. + */ + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; + if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + return; + } + + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +} + +static int __init l1tf_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (!strcmp(str, "flush,nowarn")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN; + else if (!strcmp(str, "flush")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + else if (!strcmp(str, "flush,nosmt")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else if (!strcmp(str, "full")) + l1tf_mitigation = L1TF_MITIGATION_FULL; + else if (!strcmp(str, "full,force")) + l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE; + + return 0; +} +early_param("l1tf", l1tf_cmdline); + +#undef pr_fmt + #ifdef CONFIG_SYSFS +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static const char *l1tf_vmx_states[] = { + [VMENTER_L1D_FLUSH_AUTO] = "auto", + [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", + [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", + [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes", + [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled", + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary" +}; + +static ssize_t l1tf_show_state(char *buf) +{ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || + (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && + cpu_smt_control == CPU_SMT_ENABLED)) + return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); + + return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled"); +} +#else +static ssize_t l1tf_show_state(char *buf) +{ + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); +} +#endif + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -682,6 +799,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + case X86_BUG_L1TF: + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) + return l1tf_show_state(buf); + break; default: break; } @@ -708,4 +829,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute * { return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } + +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index eb4cb3efd20e..84dee5ab745a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -661,33 +661,36 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -void detect_ht(struct cpuinfo_x86 *c) +int detect_ht_early(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - static bool printed; if (!cpu_has(c, X86_FEATURE_HT)) - return; + return -1; if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; + return -1; if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return; + return -1; cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { + if (smp_num_siblings == 1) pr_info_once("CPU0: Hyper-Threading is disabled\n"); - goto out; - } +#endif + return 0; +} - if (smp_num_siblings <= 1) - goto out; +void detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + int index_msb, core_bits; + + if (detect_ht_early(c) < 0) + return; index_msb = get_count_order(smp_num_siblings); c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); @@ -700,15 +703,6 @@ void detect_ht(struct cpuinfo_x86 *c) c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & ((1 << core_bits) - 1); - -out: - if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { - pr_info("CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - pr_info("CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } #endif } @@ -911,7 +905,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) apply_forced_caps(c); } -static void get_cpu_address_sizes(struct cpuinfo_x86 *c) +void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -987,6 +981,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { {} }; +static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { + /* in addition to cpu_no_speculation */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + {} +}; + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; @@ -1005,6 +1014,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + if (ia32_cap & ARCH_CAP_IBRS_ALL) + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (x86_match_cpu(cpu_no_meltdown)) return; @@ -1013,6 +1025,29 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + if (x86_match_cpu(cpu_no_l1tf)) + return; + + setup_force_cpu_bug(X86_BUG_L1TF); +} + +/* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(void) +{ +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_NOPL); +#else + setup_force_cpu_cap(X86_FEATURE_NOPL); +#endif } /* @@ -1089,6 +1124,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ if (!pgtable_l5_enabled()) setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(); } void __init early_cpu_init(void) @@ -1124,24 +1161,6 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -/* - * The NOPL instruction is supposed to exist on all CPUs of family >= 6; - * unfortunately, that's not true in practice because of early VIA - * chips and (more importantly) broken virtualizers that are not easy - * to detect. In the latter case it doesn't even *fail* reliably, so - * probing for it doesn't even work. Disable it completely on 32-bit - * unless we can find a reliable way to detect all the broken cases. - * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). - */ -static void detect_nopl(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_32 - clear_cpu_cap(c, X86_FEATURE_NOPL); -#else - set_cpu_cap(c, X86_FEATURE_NOPL); -#endif -} - static void detect_null_seg_behavior(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 @@ -1204,8 +1223,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_nopl(c); - detect_null_seg_behavior(c); /* @@ -1804,11 +1821,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, curr); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 38216f678fc3..7b229afa0a37 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -46,6 +46,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; extern void get_cpu_cap(struct cpuinfo_x86 *c); +extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern u32 get_scattered_cpuid_leaf(unsigned int level, @@ -55,7 +56,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index eb75564f2d25..401e8c133108 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -301,6 +301,13 @@ static void early_init_intel(struct cpuinfo_x86 *c) } check_mpx_erratum(c); + + /* + * Get the number of SMT siblings early from the extended topology + * leaf, if available. Otherwise try the legacy SMT detection. + */ + if (detect_extended_topology_early(c) < 0) + detect_ht_early(c); } #ifdef CONFIG_X86_32 @@ -465,14 +472,17 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) #define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 #define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 #define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 +#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + u32 msr_vpid_cap, msr_ept_cap; clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); clear_cpu_cap(c, X86_FEATURE_VNMI); clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); clear_cpu_cap(c, X86_FEATURE_EPT); clear_cpu_cap(c, X86_FEATURE_VPID); + clear_cpu_cap(c, X86_FEATURE_EPT_AD); rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); msr_ctl = vmx_msr_high | vmx_msr_low; @@ -487,8 +497,13 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) { set_cpu_cap(c, X86_FEATURE_EPT); + rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, + msr_ept_cap, msr_vpid_cap); + if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD) + set_cpu_cap(c, X86_FEATURE_EPT_AD); + } if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) set_cpu_cap(c, X86_FEATURE_VPID); } diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index ec4754f81cbd..abb71ac70443 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -859,6 +859,8 @@ static __init bool get_rdt_resources(void) return (rdt_mon_capable || rdt_alloc_capable); } +static enum cpuhp_state rdt_online; + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -880,6 +882,7 @@ static int __init intel_rdt_late_init(void) cpuhp_remove_state(state); return ret; } + rdt_online = state; for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); @@ -891,3 +894,11 @@ static int __init intel_rdt_late_init(void) } late_initcall(intel_rdt_late_init); + +static void __exit intel_rdt_exit(void) +{ + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); +} + +__exitcall(intel_rdt_exit); diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 39752825e376..4e588f36228f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -81,6 +81,34 @@ enum rdt_group_type { }; /** + * enum rdtgrp_mode - Mode of a RDT resource group + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations + * allowed AND the allocations are Cache Pseudo-Locked + * + * The mode of a resource group enables control over the allowed overlap + * between allocations associated with different resource groups (classes + * of service). User is able to modify the mode of a resource group by + * writing to the "mode" resctrl file associated with the resource group. + * + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by + * writing the appropriate text to the "mode" file. A resource group enters + * "pseudo-locked" mode after the schemata is written while the resource + * group is in "pseudo-locksetup" mode. + */ +enum rdtgrp_mode { + RDT_MODE_SHAREABLE = 0, + RDT_MODE_EXCLUSIVE, + RDT_MODE_PSEUDO_LOCKSETUP, + RDT_MODE_PSEUDO_LOCKED, + + /* Must be last */ + RDT_NUM_MODES, +}; + +/** * struct mongroup - store mon group's data in resctrl fs. * @mon_data_kn kernlfs node for the mon_data directory * @parent: parent rdtgrp @@ -95,6 +123,43 @@ struct mongroup { }; /** + * struct pseudo_lock_region - pseudo-lock region information + * @r: RDT resource to which this pseudo-locked region + * belongs + * @d: RDT domain to which this pseudo-locked region + * belongs + * @cbm: bitmask of the pseudo-locked region + * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread + * completion + * @thread_done: variable used by waitqueue to test if pseudo-locking + * thread completed + * @cpu: core associated with the cache on which the setup code + * will be run + * @line_size: size of the cache lines + * @size: size of pseudo-locked region in bytes + * @kmem: the kernel memory associated with pseudo-locked region + * @minor: minor number of character device associated with this + * region + * @debugfs_dir: pointer to this region's directory in the debugfs + * filesystem + * @pm_reqs: Power management QoS requests related to this region + */ +struct pseudo_lock_region { + struct rdt_resource *r; + struct rdt_domain *d; + u32 cbm; + wait_queue_head_t lock_thread_wq; + int thread_done; + int cpu; + unsigned int line_size; + unsigned int size; + void *kmem; + unsigned int minor; + struct dentry *debugfs_dir; + struct list_head pm_reqs; +}; + +/** * struct rdtgroup - store rdtgroup's data in resctrl file system. * @kn: kernfs node * @rdtgroup_list: linked list for all rdtgroups @@ -106,16 +171,20 @@ struct mongroup { * @type: indicates type of this rdtgroup - either * monitor only or ctrl_mon group * @mon: mongroup related data + * @mode: mode of resource group + * @plr: pseudo-locked region */ struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; + enum rdtgrp_mode mode; + struct pseudo_lock_region *plr; }; /* rdtgroup.flags */ @@ -148,6 +217,7 @@ extern struct list_head rdt_all_groups; extern int max_name_width, max_data_width; int __init rdtgroup_init(void); +void __exit rdtgroup_exit(void); /** * struct rftype - describe each file in the resctrl file system @@ -216,22 +286,24 @@ struct mbm_state { * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps * @new_ctrl: new ctrl value to be loaded * @have_new_ctrl: did user provide new_ctrl for this domain + * @plr: pseudo-locked region (if any) associated with domain */ struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; - struct delayed_work mbm_over; - struct delayed_work cqm_limbo; - int mbm_work_cpu; - int cqm_work_cpu; - u32 *ctrl_val; - u32 *mbps_val; - u32 new_ctrl; - bool have_new_ctrl; + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 *mbps_val; + u32 new_ctrl; + bool have_new_ctrl; + struct pseudo_lock_region *plr; }; /** @@ -351,7 +423,7 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + int (*parse_ctrlval) (void *data, struct rdt_resource *r, struct rdt_domain *d); struct list_head evt_list; int num_rmid; @@ -359,8 +431,8 @@ struct rdt_resource { unsigned long fflags; }; -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -368,7 +440,7 @@ extern struct rdt_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); -int __init rdtgroup_init(void); +extern struct dentry *debugfs_resctrl; enum { RDT_RESOURCE_L3, @@ -439,13 +511,32 @@ void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask); struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, struct list_head **pos); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + u32 _cbm, int closid, bool exclusive); +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, + u32 cbm); +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); +int rdtgroup_tasks_assigned(struct rdtgroup *r); +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm); +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); +int rdt_pseudo_lock_init(void); +void rdt_pseudo_lock_release(void); +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int update_domains(struct rdt_resource *r, int closid); +void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 116d57b248d3..af358ca05160 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,9 +64,10 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; + char *buf = _buf; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); @@ -87,7 +88,7 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. */ -static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -122,22 +123,64 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } +struct rdt_cbm_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) { - unsigned long data; + struct rdt_cbm_parse_data *data = _data; + struct rdtgroup *rdtgrp = data->rdtgrp; + u32 cbm_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if(!cbm_validate(buf, &data, r)) + /* + * Cannot set up more than one pseudo-locked region in a cache + * hierarchy. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + rdtgroup_pseudo_locked_in_hierarchy(d)) { + rdt_last_cmd_printf("pseudo-locked region in hierarchy\n"); return -EINVAL; - d->new_ctrl = data; + } + + if (!cbm_validate(data->buf, &cbm_val, r)) + return -EINVAL; + + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_SHAREABLE) && + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { + rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n"); + return -EINVAL; + } + + /* + * The CBM may not overlap with the CBM of another closid if + * either is exclusive. + */ + if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) { + rdt_last_cmd_printf("overlaps with exclusive group\n"); + return -EINVAL; + } + + if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + rdt_last_cmd_printf("overlaps with other group\n"); + return -EINVAL; + } + } + + d->new_ctrl = cbm_val; d->have_new_ctrl = true; return 0; @@ -149,8 +192,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) * separated by ";". The "id" is in decimal, and must match one of * the "id"s for this resource. */ -static int parse_line(char *line, struct rdt_resource *r) +static int parse_line(char *line, struct rdt_resource *r, + struct rdtgroup *rdtgrp) { + struct rdt_cbm_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; @@ -167,15 +212,32 @@ next: dom = strim(dom); list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { - if (r->parse_ctrlval(dom, r, d)) + data.buf = dom; + data.rdtgrp = rdtgrp; + if (r->parse_ctrlval(&data, r, d)) return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * In pseudo-locking setup mode and just + * parsed a valid CBM that should be + * pseudo-locked. Only one locked region per + * resource group and domain so just do + * the required initialization for single + * region and return. + */ + rdtgrp->plr->r = r; + rdtgrp->plr->d = d; + rdtgrp->plr->cbm = d->new_ctrl; + d->plr = rdtgrp->plr; + return 0; + } goto next; } } return -EINVAL; } -static int update_domains(struct rdt_resource *r, int closid) +int update_domains(struct rdt_resource *r, int closid) { struct msr_param msr_param; cpumask_var_t cpu_mask; @@ -220,13 +282,14 @@ done: return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +static int rdtgroup_parse_resource(char *resname, char *tok, + struct rdtgroup *rdtgrp) { struct rdt_resource *r; for_each_alloc_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && closid < r->num_closid) - return parse_line(tok, r); + if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid) + return parse_line(tok, r, rdtgrp); } rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname); return -EINVAL; @@ -239,7 +302,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, struct rdt_domain *dom; struct rdt_resource *r; char *tok, *resname; - int closid, ret = 0; + int ret = 0; /* Valid input requires a trailing newline */ if (nbytes == 0 || buf[nbytes - 1] != '\n') @@ -253,7 +316,15 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, } rdt_last_cmd_clear(); - closid = rdtgrp->closid; + /* + * No changes to pseudo-locked region allowed. It has to be removed + * and re-created instead. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = -EINVAL; + rdt_last_cmd_puts("resource group is pseudo-locked\n"); + goto out; + } for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) @@ -272,17 +343,27 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, ret = -EINVAL; goto out; } - ret = rdtgroup_parse_resource(resname, tok, closid); + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); if (ret) goto out; } for_each_alloc_enabled_rdt_resource(r) { - ret = update_domains(r, closid); + ret = update_domains(r, rdtgrp->closid); if (ret) goto out; } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * If pseudo-locking fails we keep the resource group in + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service + * active and updated for just the domain the pseudo-locked + * region was requested for. + */ + ret = rdtgroup_pseudo_lock_create(rdtgrp); + } + out: rdtgroup_kn_unlock(of->kn); return ret ?: nbytes; @@ -318,10 +399,18 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { - closid = rdtgrp->closid; - for_each_alloc_enabled_rdt_resource(r) { - if (closid < r->num_closid) - show_doms(s, r, closid); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + for_each_alloc_enabled_rdt_resource(r) + seq_printf(s, "%s:uninitialized\n", r->name); + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name, + rdtgrp->plr->d->id, rdtgrp->plr->cbm); + } else { + closid = rdtgrp->closid; + for_each_alloc_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } } } else { ret = -ENOENT; diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c new file mode 100644 index 000000000000..40f3903ae5d9 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -0,0 +1,1522 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resource Director Technology (RDT) + * + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Reinette Chatre <reinette.chatre@intel.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cacheinfo.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/kthread.h> +#include <linux/mman.h> +#include <linux/pm_qos.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include <asm/cacheflush.h> +#include <asm/intel-family.h> +#include <asm/intel_rdt_sched.h> +#include <asm/perf_event.h> + +#include "intel_rdt.h" + +#define CREATE_TRACE_POINTS +#include "intel_rdt_pseudo_lock_event.h" + +/* + * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware + * prefetcher state. Details about this register can be found in the MSR + * tables for specific platforms found in Intel's SDM. + */ +#define MSR_MISC_FEATURE_CONTROL 0x000001a4 + +/* + * The bits needed to disable hardware prefetching varies based on the + * platform. During initialization we will discover which bits to use. + */ +static u64 prefetch_disable_bits; + +/* + * Major number assigned to and shared by all devices exposing + * pseudo-locked regions. + */ +static unsigned int pseudo_lock_major; +static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); +static struct class *pseudo_lock_class; + +/** + * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * + * Capture the list of platforms that have been validated to support + * pseudo-locking. This includes testing to ensure pseudo-locked regions + * with low cache miss rates can be created under variety of load conditions + * as well as that these pseudo-locked regions can maintain their low cache + * miss rates under variety of load conditions for significant lengths of time. + * + * After a platform has been validated to support pseudo-locking its + * hardware prefetch disable bits are included here as they are documented + * in the SDM. + * + * When adding a platform here also add support for its cache events to + * measure_cycles_perf_fn() + * + * Return: + * If platform is supported, the bits to disable hardware prefetchers, 0 + * if platform is not supported. + */ +static u64 get_prefetch_disable_bits(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return 0; + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W) + * 2 DCU Hardware Prefetcher Disable (R/W) + * 3 DCU IP Prefetcher Disable (R/W) + * 63:4 Reserved + */ + return 0xF; + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GEMINI_LAKE: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 Reserved + * 2 DCU Hardware Prefetcher Disable (R/W) + * 63:3 Reserved + */ + return 0x5; + } + + return 0; +} + +/* + * Helper to write 64bit value to MSR without tracing. Used when + * use of the cache should be restricted and use of registers used + * for local variables avoided. + */ +static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val) +{ + __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); +} + +/** + * pseudo_lock_minor_get - Obtain available minor number + * @minor: Pointer to where new minor number will be stored + * + * A bitmask is used to track available minor numbers. Here the next free + * minor number is marked as unavailable and returned. + * + * Return: 0 on success, <0 on failure. + */ +static int pseudo_lock_minor_get(unsigned int *minor) +{ + unsigned long first_bit; + + first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); + + if (first_bit == MINORBITS) + return -ENOSPC; + + __clear_bit(first_bit, &pseudo_lock_minor_avail); + *minor = first_bit; + + return 0; +} + +/** + * pseudo_lock_minor_release - Return minor number to available + * @minor: The minor number made available + */ +static void pseudo_lock_minor_release(unsigned int minor) +{ + __set_bit(minor, &pseudo_lock_minor_avail); +} + +/** + * region_find_by_minor - Locate a pseudo-lock region by inode minor number + * @minor: The minor number of the device representing pseudo-locked region + * + * When the character device is accessed we need to determine which + * pseudo-locked region it belongs to. This is done by matching the minor + * number of the device to the pseudo-locked region it belongs. + * + * Minor numbers are assigned at the time a pseudo-locked region is associated + * with a cache instance. + * + * Return: On success return pointer to resource group owning the pseudo-locked + * region, NULL on failure. + */ +static struct rdtgroup *region_find_by_minor(unsigned int minor) +{ + struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->plr && rdtgrp->plr->minor == minor) { + rdtgrp_match = rdtgrp; + break; + } + } + return rdtgrp_match; +} + +/** + * pseudo_lock_pm_req - A power management QoS request list entry + * @list: Entry within the @pm_reqs list for a pseudo-locked region + * @req: PM QoS request + */ +struct pseudo_lock_pm_req { + struct list_head list; + struct dev_pm_qos_request req; +}; + +static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req, *next; + + list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { + dev_pm_qos_remove_request(&pm_req->req); + list_del(&pm_req->list); + kfree(pm_req); + } +} + +/** + * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * + * To prevent the cache from being affected by power management entering + * C6 has to be avoided. This is accomplished by requesting a latency + * requirement lower than lowest C6 exit latency of all supported + * platforms as found in the cpuidle state tables in the intel_idle driver. + * At this time it is possible to do so with a single latency requirement + * for all supported platforms. + * + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, + * the ACPI latencies need to be considered while keeping in mind that C2 + * may be set to map to deeper sleep states. In this case the latency + * requirement needs to prevent entering C2 also. + */ +static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req; + int cpu; + int ret; + + for_each_cpu(cpu, &plr->d->cpu_mask) { + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); + if (!pm_req) { + rdt_last_cmd_puts("fail allocating mem for PM QoS\n"); + ret = -ENOMEM; + goto out_err; + } + ret = dev_pm_qos_add_request(get_cpu_device(cpu), + &pm_req->req, + DEV_PM_QOS_RESUME_LATENCY, + 30); + if (ret < 0) { + rdt_last_cmd_printf("fail to add latency req cpu%d\n", + cpu); + kfree(pm_req); + ret = -1; + goto out_err; + } + list_add(&pm_req->list, &plr->pm_reqs); + } + + return 0; + +out_err: + pseudo_lock_cstates_relax(plr); + return ret; +} + +/** + * pseudo_lock_region_clear - Reset pseudo-lock region data + * @plr: pseudo-lock region + * + * All content of the pseudo-locked region is reset - any memory allocated + * freed. + * + * Return: void + */ +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) +{ + plr->size = 0; + plr->line_size = 0; + kfree(plr->kmem); + plr->kmem = NULL; + plr->r = NULL; + if (plr->d) + plr->d->plr = NULL; + plr->d = NULL; + plr->cbm = 0; + plr->debugfs_dir = NULL; +} + +/** + * pseudo_lock_region_init - Initialize pseudo-lock region information + * @plr: pseudo-lock region + * + * Called after user provided a schemata to be pseudo-locked. From the + * schemata the &struct pseudo_lock_region is on entry already initialized + * with the resource, domain, and capacity bitmask. Here the information + * required for pseudo-locking is deduced from this data and &struct + * pseudo_lock_region initialized further. This information includes: + * - size in bytes of the region to be pseudo-locked + * - cache line size to know the stride with which data needs to be accessed + * to be pseudo-locked + * - a cpu associated with the cache instance on which the pseudo-locking + * flow can be executed + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_init(struct pseudo_lock_region *plr) +{ + struct cpu_cacheinfo *ci; + int ret; + int i; + + /* Pick the first cpu we find that is associated with the cache. */ + plr->cpu = cpumask_first(&plr->d->cpu_mask); + + if (!cpu_online(plr->cpu)) { + rdt_last_cmd_printf("cpu %u associated with cache not online\n", + plr->cpu); + ret = -ENODEV; + goto out_region; + } + + ci = get_cpu_cacheinfo(plr->cpu); + + plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == plr->r->cache_level) { + plr->line_size = ci->info_list[i].coherency_line_size; + return 0; + } + } + + ret = -1; + rdt_last_cmd_puts("unable to determine cache line size\n"); +out_region: + pseudo_lock_region_clear(plr); + return ret; +} + +/** + * pseudo_lock_init - Initialize a pseudo-lock region + * @rdtgrp: resource group to which new pseudo-locked region will belong + * + * A pseudo-locked region is associated with a resource group. When this + * association is created the pseudo-locked region is initialized. The + * details of the pseudo-locked region are not known at this time so only + * allocation is done and association established. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_init(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr; + + plr = kzalloc(sizeof(*plr), GFP_KERNEL); + if (!plr) + return -ENOMEM; + + init_waitqueue_head(&plr->lock_thread_wq); + INIT_LIST_HEAD(&plr->pm_reqs); + rdtgrp->plr = plr; + return 0; +} + +/** + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked + * @plr: pseudo-lock region + * + * Initialize the details required to set up the pseudo-locked region and + * allocate the contiguous memory that will be pseudo-locked to the cache. + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) +{ + int ret; + + ret = pseudo_lock_region_init(plr); + if (ret < 0) + return ret; + + /* + * We do not yet support contiguous regions larger than + * KMALLOC_MAX_SIZE. + */ + if (plr->size > KMALLOC_MAX_SIZE) { + rdt_last_cmd_puts("requested region exceeds maximum size\n"); + ret = -E2BIG; + goto out_region; + } + + plr->kmem = kzalloc(plr->size, GFP_KERNEL); + if (!plr->kmem) { + rdt_last_cmd_puts("unable to allocate memory\n"); + ret = -ENOMEM; + goto out_region; + } + + ret = 0; + goto out; +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * pseudo_lock_free - Free a pseudo-locked region + * @rdtgrp: resource group to which pseudo-locked region belonged + * + * The pseudo-locked region's resources have already been released, or not + * yet created at this point. Now it can be freed and disassociated from the + * resource group. + * + * Return: void + */ +static void pseudo_lock_free(struct rdtgroup *rdtgrp) +{ + pseudo_lock_region_clear(rdtgrp->plr); + kfree(rdtgrp->plr); + rdtgrp->plr = NULL; +} + +/** + * pseudo_lock_fn - Load kernel memory into cache + * @_rdtgrp: resource group to which pseudo-lock region belongs + * + * This is the core pseudo-locking flow. + * + * First we ensure that the kernel memory cannot be found in the cache. + * Then, while taking care that there will be as little interference as + * possible, the memory to be loaded is accessed while core is running + * with class of service set to the bitmask of the pseudo-locked region. + * After this is complete no future CAT allocations will be allowed to + * overlap with this bitmask. + * + * Local register variables are utilized to ensure that the memory region + * to be locked is the only memory access made during the critical locking + * loop. + * + * Return: 0. Waiter on waitqueue will be woken on completion. + */ +static int pseudo_lock_fn(void *_rdtgrp) +{ + struct rdtgroup *rdtgrp = _rdtgrp; + struct pseudo_lock_region *plr = rdtgrp->plr; + u32 rmid_p, closid_p; + unsigned long i; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use a regular + * variable to ensure we always use a valid pointer, but the cost + * is that this variable will enter the cache through evicting the + * memory we are trying to lock into the cache. Thus expect lower + * pseudo-locking success rate when KASAN is active. + */ + unsigned int line_size; + unsigned int size; + void *mem_r; +#else + register unsigned int line_size asm("esi"); + register unsigned int size asm("edi"); +#ifdef CONFIG_X86_64 + register void *mem_r asm("rbx"); +#else + register void *mem_r asm("ebx"); +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KASAN */ + + /* + * Make sure none of the allocated memory is cached. If it is we + * will get a cache hit in below loop from outside of pseudo-locked + * region. + * wbinvd (as opposed to clflush/clflushopt) is required to + * increase likelihood that allocated cache portion will be filled + * with associated memory. + */ + native_wbinvd(); + + /* + * Always called with interrupts enabled. By disabling interrupts + * ensure that we will not be preempted during this critical section. + */ + local_irq_disable(); + + /* + * Call wrmsr and rdmsr as directly as possible to avoid tracing + * clobbering local register variables or affecting cache accesses. + * + * Disable the hardware prefetcher so that when the end of the memory + * being pseudo-locked is reached the hardware will not read beyond + * the buffer and evict pseudo-locked memory read earlier from the + * cache. + */ + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + closid_p = this_cpu_read(pqr_state.cur_closid); + rmid_p = this_cpu_read(pqr_state.cur_rmid); + mem_r = plr->kmem; + size = plr->size; + line_size = plr->line_size; + /* + * Critical section begin: start by writing the closid associated + * with the capacity bitmask of the cache region being + * pseudo-locked followed by reading of kernel memory to load it + * into the cache. + */ + __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + /* + * Cache was flushed earlier. Now access kernel memory to read it + * into cache region associated with just activated plr->closid. + * Loop over data twice: + * - In first loop the cache region is shared with the page walker + * as it populates the paging structure caches (including TLB). + * - In the second loop the paging structure caches are used and + * cache region is populated with the memory being referenced. + */ + for (i = 0; i < size; i += PAGE_SIZE) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Critical section end: restore closid with capacity bitmask that + * does not overlap with pseudo-locked region. + */ + __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); + + /* Re-enable the hardware prefetcher(s) */ + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); + + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/** + * rdtgroup_monitor_in_progress - Test if monitoring in progress + * @r: resource group being queried + * + * Return: 1 if monitor groups have been created for this resource + * group, 0 otherwise. + */ +static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) +{ + return !list_empty(&rdtgrp->mon.crdtgrp_list); +} + +/** + * rdtgroup_locksetup_user_restrict - Restrict user access to group + * @rdtgrp: resource group needing access restricted + * + * A resource group used for cache pseudo-locking cannot have cpus or tasks + * assigned to it. This is communicated to the user by restricting access + * to all the files that can be used to make such changes. + * + * Permissions restored with rdtgroup_locksetup_user_restore() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restriction of access an attempt will be made to restore permissions but + * the state of the mode of these files will be uncertain when a failure + * occurs. + */ +static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); +err_cpus: + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); +err_tasks: + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); +out: + return ret; +} + +/** + * rdtgroup_locksetup_user_restore - Restore user access to group + * @rdtgrp: resource group needing access restored + * + * Restore all file access previously removed using + * rdtgroup_locksetup_user_restrict() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restoration of access an attempt will be made to restrict permissions + * again but the state of the mode of these files will be uncertain when + * a failure occurs. + */ +static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); +err_cpus: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); +err_tasks: + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); +out: + return ret; +} + +/** + * rdtgroup_locksetup_enter - Resource group enters locksetup mode + * @rdtgrp: resource group requested to enter locksetup mode + * + * A resource group enters locksetup mode to reflect that it would be used + * to represent a pseudo-locked region and is in the process of being set + * up to do so. A resource group used for a pseudo-locked region would + * lose the closid associated with it so we cannot allow it to have any + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the + * future. Monitoring of a pseudo-locked region is not allowed either. + * + * The above and more restrictions on a pseudo-locked region are checked + * for and enforced before the resource group enters the locksetup mode. + * + * Returns: 0 if the resource group successfully entered locksetup mode, <0 + * on failure. On failure the last_cmd_status buffer is updated with text to + * communicate details of failure to the user. + */ +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + int ret; + + /* + * The default resource group can neither be removed nor lose the + * default closid associated with it. + */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("cannot pseudo-lock default group\n"); + return -EINVAL; + } + + /* + * Cache Pseudo-locking not supported when CDP is enabled. + * + * Some things to consider if you would like to enable this + * support (using L3 CDP as example): + * - When CDP is enabled two separate resources are exposed, + * L3DATA and L3CODE, but they are actually on the same cache. + * The implication for pseudo-locking is that if a + * pseudo-locked region is created on a domain of one + * resource (eg. L3CODE), then a pseudo-locked region cannot + * be created on that same domain of the other resource + * (eg. L3DATA). This is because the creation of a + * pseudo-locked region involves a call to wbinvd that will + * affect all cache allocations on particular domain. + * - Considering the previous, it may be possible to only + * expose one of the CDP resources to pseudo-locking and + * hide the other. For example, we could consider to only + * expose L3DATA and since the L3 cache is unified it is + * still possible to place instructions there are execute it. + * - If only one region is exposed to pseudo-locking we should + * still keep in mind that availability of a portion of cache + * for pseudo-locking should take into account both resources. + * Similarly, if a pseudo-locked region is created in one + * resource, the portion of cache used by it should be made + * unavailable to all future allocations from both resources. + */ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || + rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { + rdt_last_cmd_puts("CDP enabled\n"); + return -EINVAL; + } + + /* + * Not knowing the bits to disable prefetching implies that this + * platform does not support Cache Pseudo-Locking. + */ + prefetch_disable_bits = get_prefetch_disable_bits(); + if (prefetch_disable_bits == 0) { + rdt_last_cmd_puts("pseudo-locking not supported\n"); + return -EINVAL; + } + + if (rdtgroup_monitor_in_progress(rdtgrp)) { + rdt_last_cmd_puts("monitoring in progress\n"); + return -EINVAL; + } + + if (rdtgroup_tasks_assigned(rdtgrp)) { + rdt_last_cmd_puts("tasks assigned to resource group\n"); + return -EINVAL; + } + + if (!cpumask_empty(&rdtgrp->cpu_mask)) { + rdt_last_cmd_puts("CPUs assigned to resource group\n"); + return -EINVAL; + } + + if (rdtgroup_locksetup_user_restrict(rdtgrp)) { + rdt_last_cmd_puts("unable to modify resctrl permissions\n"); + return -EIO; + } + + ret = pseudo_lock_init(rdtgrp); + if (ret) { + rdt_last_cmd_puts("unable to init pseudo-lock region\n"); + goto out_release; + } + + /* + * If this system is capable of monitoring a rmid would have been + * allocated when the control group was created. This is not needed + * anymore when this group would be used for pseudo-locking. This + * is safe to call on platforms not capable of monitoring. + */ + free_rmid(rdtgrp->mon.rmid); + + ret = 0; + goto out; + +out_release: + rdtgroup_locksetup_user_restore(rdtgrp); +out: + return ret; +} + +/** + * rdtgroup_locksetup_exit - resource group exist locksetup mode + * @rdtgrp: resource group + * + * When a resource group exits locksetup mode the earlier restrictions are + * lifted. + * + * Return: 0 on success, <0 on failure + */ +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + int ret; + + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) { + rdt_last_cmd_puts("out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + } + + ret = rdtgroup_locksetup_user_restore(rdtgrp); + if (ret) { + free_rmid(rdtgrp->mon.rmid); + return ret; + } + + pseudo_lock_free(rdtgrp); + return 0; +} + +/** + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked + * @d: RDT domain + * @_cbm: CBM to test + * + * @d represents a cache instance and @_cbm a capacity bitmask that is + * considered for it. Determine if @_cbm overlaps with any existing + * pseudo-locked region on @d. + * + * Return: true if @_cbm overlaps with pseudo-locked region on @d, false + * otherwise. + */ +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm) +{ + unsigned long *cbm = (unsigned long *)&_cbm; + unsigned long *cbm_b; + unsigned int cbm_len; + + if (d->plr) { + cbm_len = d->plr->r->cache.cbm_len; + cbm_b = (unsigned long *)&d->plr->cbm; + if (bitmap_intersects(cbm, cbm_b, cbm_len)) + return true; + } + return false; +} + +/** + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy + * @d: RDT domain under test + * + * The setup of a pseudo-locked region affects all cache instances within + * the hierarchy of the region. It is thus essential to know if any + * pseudo-locked regions exist within a cache hierarchy to prevent any + * attempts to create new pseudo-locked regions in the same hierarchy. + * + * Return: true if a pseudo-locked region exists in the hierarchy of @d or + * if it is not possible to test due to memory allocation issue, + * false otherwise. + */ +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) +{ + cpumask_var_t cpu_with_psl; + struct rdt_resource *r; + struct rdt_domain *d_i; + bool ret = false; + + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) + return true; + + /* + * First determine which cpus have pseudo-locked regions + * associated with them. + */ + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(d_i, &r->domains, list) { + if (d_i->plr) + cpumask_or(cpu_with_psl, cpu_with_psl, + &d_i->cpu_mask); + } + } + + /* + * Next test if new pseudo-locked region would intersect with + * existing region. + */ + if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) + ret = true; + + free_cpumask_var(cpu_with_psl); + return ret; +} + +/** + * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory + * @_plr: pseudo-lock region to measure + * + * There is no deterministic way to test if a memory region is cached. One + * way is to measure how long it takes to read the memory, the speed of + * access is a good way to learn how close to the cpu the data was. Even + * more, if the prefetcher is disabled and the memory is read at a stride + * of half the cache line, then a cache miss will be easy to spot since the + * read of the first half would be significantly slower than the read of + * the second half. + * + * Return: 0. Waiter on waitqueue will be woken on completion. + */ +static int measure_cycles_lat_fn(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + unsigned long i; + u64 start, end; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use a regular + * variable to ensure we always use a valid pointer to access memory. + * The cost is that accessing this pointer, which could be in + * cache, will be included in the measurement of memory read latency. + */ + void *mem_r; +#else +#ifdef CONFIG_X86_64 + register void *mem_r asm("rbx"); +#else + register void *mem_r asm("ebx"); +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KASAN */ + + local_irq_disable(); + /* + * The wrmsr call may be reordered with the assignment below it. + * Call wrmsr as directly as possible to avoid tracing clobbering + * local register variable used for memory pointer. + */ + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + mem_r = plr->kmem; + /* + * Dummy execute of the time measurement to load the needed + * instructions into the L1 instruction cache. + */ + start = rdtsc_ordered(); + for (i = 0; i < plr->size; i += 32) { + start = rdtsc_ordered(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + end = rdtsc_ordered(); + trace_pseudo_lock_mem_latency((u32)(end - start)); + } + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +static int measure_cycles_perf_fn(void *_plr) +{ + unsigned long long l3_hits = 0, l3_miss = 0; + u64 l3_hit_bits = 0, l3_miss_bits = 0; + struct pseudo_lock_region *plr = _plr; + unsigned long long l2_hits, l2_miss; + u64 l2_hit_bits, l2_miss_bits; + unsigned long i; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use regular variables + * at the cost of including cache access latency to these variables + * in the measurements. + */ + unsigned int line_size; + unsigned int size; + void *mem_r; +#else + register unsigned int line_size asm("esi"); + register unsigned int size asm("edi"); +#ifdef CONFIG_X86_64 + register void *mem_r asm("rbx"); +#else + register void *mem_r asm("ebx"); +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KASAN */ + + /* + * Non-architectural event for the Goldmont Microarchitecture + * from Intel x86 Architecture Software Developer Manual (SDM): + * MEM_LOAD_UOPS_RETIRED D1H (event number) + * Umask values: + * L1_HIT 01H + * L2_HIT 02H + * L1_MISS 08H + * L2_MISS 10H + * + * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event + * has two "no fix" errata associated with it: BDM35 and BDM100. On + * this platform we use the following events instead: + * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/) + * REFERENCES FFH + * MISS 3FH + * LONGEST_LAT_CACHE 2EH (Documented in SDM) + * REFERENCE 4FH + * MISS 41H + */ + + /* + * Start by setting flags for IA32_PERFEVTSELx: + * OS (Operating system mode) 0x2 + * INT (APIC interrupt enable) 0x10 + * EN (Enable counter) 0x40 + * + * Then add the Umask value and event number to select performance + * event. + */ + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GEMINI_LAKE: + l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1; + l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1; + break; + case INTEL_FAM6_BROADWELL_X: + /* On BDW the l2_hit_bits count references, not hits */ + l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24; + l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24; + /* On BDW the l3_hit_bits count references, not hits */ + l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e; + l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e; + break; + default: + goto out; + } + + local_irq_disable(); + /* + * Call wrmsr direcly to avoid the local register variables from + * being overwritten due to reordering of their assignment with + * the wrmsr calls. + */ + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + /* Disable events and reset counters */ + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0); + if (l3_hit_bits > 0) { + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0); + } + /* Set and enable the L2 counters */ + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits); + if (l3_hit_bits > 0) { + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, + l3_hit_bits); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, + l3_miss_bits); + } + mem_r = plr->kmem; + size = plr->size; + line_size = plr->line_size; + for (i = 0; i < size; i += line_size) { + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Call wrmsr directly (no tracing) to not influence + * the cache access counters as they are disabled. + */ + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, + l2_hit_bits & ~(0x40ULL << 16)); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, + l2_miss_bits & ~(0x40ULL << 16)); + if (l3_hit_bits > 0) { + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, + l3_hit_bits & ~(0x40ULL << 16)); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, + l3_miss_bits & ~(0x40ULL << 16)); + } + l2_hits = native_read_pmc(0); + l2_miss = native_read_pmc(1); + if (l3_hit_bits > 0) { + l3_hits = native_read_pmc(2); + l3_miss = native_read_pmc(3); + } + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); + /* + * On BDW we count references and misses, need to adjust. Sometimes + * the "hits" counter is a bit more than the references, for + * example, x references but x + 1 hits. To not report invalid + * hit values in this case we treat that as misses eaqual to + * references. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) + l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss); + trace_pseudo_lock_l2(l2_hits, l2_miss); + if (l3_hit_bits > 0) { + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) + l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss); + trace_pseudo_lock_l3(l3_hits, l3_miss); + } + +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/** + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * + * The measurement of latency to access a pseudo-locked region should be + * done from a cpu that is associated with that pseudo-locked region. + * Determine which cpu is associated with this region and start a thread on + * that cpu to perform the measurement, wait for that thread to complete. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int cpu; + int ret = -1; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out; + } + + plr->thread_done = 0; + cpu = cpumask_first(&plr->d->cpu_mask); + if (!cpu_online(cpu)) { + ret = -ENODEV; + goto out; + } + + if (sel == 1) + thread = kthread_create_on_node(measure_cycles_lat_fn, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 2) + thread = kthread_create_on_node(measure_cycles_perf_fn, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else + goto out; + + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + goto out; + } + kthread_bind(thread, cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) + goto out; + + ret = 0; + +out: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static ssize_t pseudo_lock_measure_trigger(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rdtgroup *rdtgrp = file->private_data; + size_t buf_size; + char buf[32]; + int ret; + int sel; + + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + ret = kstrtoint(buf, 10, &sel); + if (ret == 0) { + if (sel != 1) + return -EINVAL; + ret = debugfs_file_get(file->f_path.dentry); + if (ret) + return ret; + ret = pseudo_lock_measure_cycles(rdtgrp, sel); + if (ret == 0) + ret = count; + debugfs_file_put(file->f_path.dentry); + } + + return ret; +} + +static const struct file_operations pseudo_measure_fops = { + .write = pseudo_lock_measure_trigger, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region + * @rdtgrp: resource group to which pseudo-lock region belongs + * + * Called when a resource group in the pseudo-locksetup mode receives a + * valid schemata that should be pseudo-locked. Since the resource group is + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been + * allocated and initialized with the essential information. If a failure + * occurs the resource group remains in the pseudo-locksetup mode with the + * &struct pseudo_lock_region associated with it, but cleared from all + * information and ready for the user to re-attempt pseudo-locking by + * writing the schemata again. + * + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 + * on failure. Descriptive error will be written to last_cmd_status buffer. + */ +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int new_minor; + struct device *dev; + int ret; + + ret = pseudo_lock_region_alloc(plr); + if (ret < 0) + return ret; + + ret = pseudo_lock_cstates_constrain(plr); + if (ret < 0) { + ret = -EINVAL; + goto out_region; + } + + plr->thread_done = 0; + + thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, + cpu_to_node(plr->cpu), + "pseudo_lock/%u", plr->cpu); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + rdt_last_cmd_printf("locking thread returned error %d\n", ret); + goto out_cstates; + } + + kthread_bind(thread, plr->cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) { + /* + * If the thread does not get on the CPU for whatever + * reason and the process which sets up the region is + * interrupted then this will leave the thread in runnable + * state and once it gets on the CPU it will derefence + * the cleared, but not freed, plr struct resulting in an + * empty pseudo-locking loop. + */ + rdt_last_cmd_puts("locking thread interrupted\n"); + goto out_cstates; + } + + ret = pseudo_lock_minor_get(&new_minor); + if (ret < 0) { + rdt_last_cmd_puts("unable to obtain a new minor number\n"); + goto out_cstates; + } + + /* + * Unlock access but do not release the reference. The + * pseudo-locked region will still be here on return. + * + * The mutex has to be released temporarily to avoid a potential + * deadlock with the mm->mmap_sem semaphore which is obtained in + * the device_create() and debugfs_create_dir() callpath below + * as well as before the mmap() callback is called. + */ + mutex_unlock(&rdtgroup_mutex); + + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, + debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + + dev = device_create(pseudo_lock_class, NULL, + MKDEV(pseudo_lock_major, new_minor), + rdtgrp, "%s", rdtgrp->kn->name); + + mutex_lock(&rdtgroup_mutex); + + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + rdt_last_cmd_printf("failed to create character device: %d\n", + ret); + goto out_debugfs; + } + + /* We released the mutex - check if group was removed while we did so */ + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out_device; + } + + plr->minor = new_minor; + + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; + closid_free(rdtgrp->closid); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); + + ret = 0; + goto out; + +out_device: + device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); +out_debugfs: + debugfs_remove_recursive(plr->debugfs_dir); + pseudo_lock_minor_release(new_minor); +out_cstates: + pseudo_lock_cstates_relax(plr); +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region + * @rdtgrp: resource group to which the pseudo-locked region belongs + * + * The removal of a pseudo-locked region can be initiated when the resource + * group is removed from user space via a "rmdir" from userspace or the + * unmount of the resctrl filesystem. On removal the resource group does + * not go back to pseudo-locksetup mode before it is removed, instead it is + * removed directly. There is thus assymmetry with the creation where the + * &struct pseudo_lock_region is removed here while it was not created in + * rdtgroup_pseudo_lock_create(). + * + * Return: void + */ +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * Default group cannot be a pseudo-locked region so we can + * free closid here. + */ + closid_free(rdtgrp->closid); + goto free; + } + + pseudo_lock_cstates_relax(plr); + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); + device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + pseudo_lock_minor_release(plr->minor); + +free: + pseudo_lock_free(rdtgrp); +} + +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = region_find_by_minor(iminor(inode)); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + filp->private_data = rdtgrp; + atomic_inc(&rdtgrp->waitcount); + /* Perform a non-seekable open - llseek is not supported */ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + filp->private_data = NULL; + atomic_dec(&rdtgrp->waitcount); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +{ + /* Not supported */ + return -EINVAL; +} + +static const struct vm_operations_struct pseudo_mmap_ops = { + .mremap = pseudo_lock_dev_mremap, +}; + +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long vsize = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct pseudo_lock_region *plr; + struct rdtgroup *rdtgrp; + unsigned long physical; + unsigned long psize; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + plr = rdtgrp->plr; + + /* + * Task is required to run with affinity to the cpus associated + * with the pseudo-locked region. If this is not the case the task + * may be scheduled elsewhere and invalidate entries in the + * pseudo-locked region. + */ + if (!cpumask_subset(¤t->cpus_allowed, &plr->d->cpu_mask)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + physical = __pa(plr->kmem) >> PAGE_SHIFT; + psize = plr->size - off; + + if (off > plr->size) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + /* + * Ensure changes are carried directly to the memory being mapped, + * do not allow copy-on-write mapping. + */ + if (!(vma->vm_flags & VM_SHARED)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + if (vsize > psize) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + memset(plr->kmem + off, 0, vsize); + + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, + vsize, vma->vm_page_prot)) { + mutex_unlock(&rdtgroup_mutex); + return -EAGAIN; + } + vma->vm_ops = &pseudo_mmap_ops; + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static const struct file_operations pseudo_lock_dev_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = NULL, + .write = NULL, + .open = pseudo_lock_dev_open, + .release = pseudo_lock_dev_release, + .mmap = pseudo_lock_dev_mmap, +}; + +static char *pseudo_lock_devnode(struct device *dev, umode_t *mode) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); +} + +int rdt_pseudo_lock_init(void) +{ + int ret; + + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); + if (ret < 0) + return ret; + + pseudo_lock_major = ret; + + pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock"); + if (IS_ERR(pseudo_lock_class)) { + ret = PTR_ERR(pseudo_lock_class); + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + return ret; + } + + pseudo_lock_class->devnode = pseudo_lock_devnode; + return 0; +} + +void rdt_pseudo_lock_release(void) +{ + class_destroy(pseudo_lock_class); + pseudo_lock_class = NULL; + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + pseudo_lock_major = 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h new file mode 100644 index 000000000000..2c041e6d9f05 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM resctrl + +#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PSEUDO_LOCK_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(pseudo_lock_mem_latency, + TP_PROTO(u32 latency), + TP_ARGS(latency), + TP_STRUCT__entry(__field(u32, latency)), + TP_fast_assign(__entry->latency = latency), + TP_printk("latency=%u", __entry->latency) + ); + +TRACE_EVENT(pseudo_lock_l2, + TP_PROTO(u64 l2_hits, u64 l2_miss), + TP_ARGS(l2_hits, l2_miss), + TP_STRUCT__entry(__field(u64, l2_hits) + __field(u64, l2_miss)), + TP_fast_assign(__entry->l2_hits = l2_hits; + __entry->l2_miss = l2_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l2_hits, __entry->l2_miss)); + +TRACE_EVENT(pseudo_lock_l3, + TP_PROTO(u64 l3_hits, u64 l3_miss), + TP_ARGS(l3_hits, l3_miss), + TP_STRUCT__entry(__field(u64, l3_hits) + __field(u64, l3_miss)), + TP_fast_assign(__entry->l3_hits = l3_hits; + __entry->l3_miss = l3_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l3_hits, __entry->l3_miss)); + +#endif /* _TRACE_PSEUDO_LOCK_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE intel_rdt_pseudo_lock_event +#include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 749856a2e736..b799c00bef09 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -20,7 +20,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cacheinfo.h> #include <linux/cpu.h> +#include <linux/debugfs.h> #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kernfs.h> @@ -55,6 +57,8 @@ static struct kernfs_node *kn_mondata; static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; +struct dentry *debugfs_resctrl; + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -121,11 +125,65 @@ static int closid_alloc(void) return closid; } -static void closid_free(int closid) +void closid_free(int closid) { closid_free_map |= 1 << closid; } +/** + * closid_allocated - test if provided closid is in use + * @closid: closid to be tested + * + * Return: true if @closid is currently associated with a resource group, + * false if @closid is free + */ +static bool closid_allocated(unsigned int closid) +{ + return (closid_free_map & (1 << closid)) == 0; +} + +/** + * rdtgroup_mode_by_closid - Return mode of resource group with closid + * @closid: closid if the resource group + * + * Each resource group is associated with a @closid. Here the mode + * of a resource group can be queried by searching for it using its closid. + * + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid + */ +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdtgrp->mode; + } + + return RDT_NUM_MODES; +} + +static const char * const rdt_mode_str[] = { + [RDT_MODE_SHAREABLE] = "shareable", + [RDT_MODE_EXCLUSIVE] = "exclusive", + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", +}; + +/** + * rdtgroup_mode_str - Return the string representation of mode + * @mode: the resource group mode as &enum rdtgroup_mode + * + * Return: string representation of valid mode, "unknown" otherwise + */ +static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) +{ + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) + return "unknown"; + + return rdt_mode_str[mode]; +} + /* set uid and gid of rdtgroup dirs and files to that of the creator */ static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -146,6 +204,7 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) int ret; kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, rft->kf_ops, rft, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -207,8 +266,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->plr->d->cpu_mask)); + else + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); } else { ret = -ENOENT; } @@ -394,6 +457,13 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("pseudo-locking in progress\n"); + goto unlock; + } + if (is_cpu_list(of)) ret = cpulist_parse(buf, newmask); else @@ -509,6 +579,32 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return ret; } +/** + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group + * @r: Resource group + * + * Return: 1 if tasks have been assigned to @r, 0 otherwise + */ +int rdtgroup_tasks_assigned(struct rdtgroup *r) +{ + struct task_struct *p, *t; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + rcu_read_lock(); + for_each_process_thread(p, t) { + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + + return ret; +} + static int rdtgroup_task_write_permission(struct task_struct *task, struct kernfs_open_file *of) { @@ -570,13 +666,22 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } rdt_last_cmd_clear(); - if (rdtgrp) - ret = rdtgroup_move_task(pid, rdtgrp, of); - else - ret = -ENOENT; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("pseudo-locking in progress\n"); + goto unlock; + } + ret = rdtgroup_move_task(pid, rdtgrp, of); + +unlock: rdtgroup_kn_unlock(of->kn); return ret ?: nbytes; @@ -662,6 +767,94 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, return 0; } +/** + * rdt_bit_usage_show - Display current usage of resources + * + * A domain is a shared resource that can now be allocated differently. Here + * we display the current regions of the domain as an annotated bitmask. + * For each domain of this resource its allocation bitmask + * is annotated as below to indicate the current usage of the corresponding bit: + * 0 - currently unused + * X - currently available for sharing and used by software and hardware + * H - currently used by hardware only but available for software use + * S - currently used and shareable by software only + * E - currently used exclusively by one resource group + * P - currently pseudo-locked by one resource group + */ +static int rdt_bit_usage_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + u32 sw_shareable = 0, hw_shareable = 0; + u32 exclusive = 0, pseudo_locked = 0; + struct rdt_domain *dom; + int i, hwb, swb, excl, psl; + enum rdtgrp_mode mode; + bool sep = false; + u32 *ctrl; + + mutex_lock(&rdtgroup_mutex); + hw_shareable = r->cache.shareable_bits; + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_putc(seq, ';'); + ctrl = dom->ctrl_val; + sw_shareable = 0; + exclusive = 0; + seq_printf(seq, "%d=", dom->id); + for (i = 0; i < r->num_closid; i++, ctrl++) { + if (!closid_allocated(i)) + continue; + mode = rdtgroup_mode_by_closid(i); + switch (mode) { + case RDT_MODE_SHAREABLE: + sw_shareable |= *ctrl; + break; + case RDT_MODE_EXCLUSIVE: + exclusive |= *ctrl; + break; + case RDT_MODE_PSEUDO_LOCKSETUP: + /* + * RDT_MODE_PSEUDO_LOCKSETUP is possible + * here but not included since the CBM + * associated with this CLOSID in this mode + * is not initialized and no task or cpu can be + * assigned this CLOSID. + */ + break; + case RDT_MODE_PSEUDO_LOCKED: + case RDT_NUM_MODES: + WARN(1, + "invalid mode for closid %d\n", i); + break; + } + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { + pseudo_locked = dom->plr ? dom->plr->cbm : 0; + hwb = test_bit(i, (unsigned long *)&hw_shareable); + swb = test_bit(i, (unsigned long *)&sw_shareable); + excl = test_bit(i, (unsigned long *)&exclusive); + psl = test_bit(i, (unsigned long *)&pseudo_locked); + if (hwb && swb) + seq_putc(seq, 'X'); + else if (hwb && !swb) + seq_putc(seq, 'H'); + else if (!hwb && swb) + seq_putc(seq, 'S'); + else if (excl) + seq_putc(seq, 'E'); + else if (psl) + seq_putc(seq, 'P'); + else /* Unused bits remain */ + seq_putc(seq, '0'); + } + sep = true; + } + seq_putc(seq, '\n'); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -740,6 +933,269 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, return nbytes; } +/* + * rdtgroup_mode_show - Display mode of this resource group + */ +static int rdtgroup_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +/** + * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Checks if provided @cbm intended to be used for @closid on domain + * @d overlaps with any other closids or other hardware usage associated + * with this domain. If @exclusive is true then only overlaps with + * resource groups in exclusive mode will be considered. If @exclusive + * is false then overlaps with any resource group or hardware entities + * will be considered. + * + * Return: false if CBM does not overlap, true if it does. + */ +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + u32 _cbm, int closid, bool exclusive) +{ + unsigned long *cbm = (unsigned long *)&_cbm; + unsigned long *ctrl_b; + enum rdtgrp_mode mode; + u32 *ctrl; + int i; + + /* Check for any overlap with regions used by hardware directly */ + if (!exclusive) { + if (bitmap_intersects(cbm, + (unsigned long *)&r->cache.shareable_bits, + r->cache.cbm_len)) + return true; + } + + /* Check for overlap with other resource groups */ + ctrl = d->ctrl_val; + for (i = 0; i < r->num_closid; i++, ctrl++) { + ctrl_b = (unsigned long *)ctrl; + mode = rdtgroup_mode_by_closid(i); + if (closid_allocated(i) && i != closid && + mode != RDT_MODE_PSEUDO_LOCKSETUP) { + if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) { + if (exclusive) { + if (mode == RDT_MODE_EXCLUSIVE) + return true; + continue; + } + return true; + } + } + } + + return false; +} + +/** + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * + * An exclusive resource group implies that there should be no sharing of + * its allocated resources. At the time this group is considered to be + * exclusive this test can determine if its current schemata supports this + * setting by testing for overlap with all other resource groups. + * + * Return: true if resource group can be exclusive, false if there is overlap + * with allocations of other resource groups and thus this resource group + * cannot be exclusive. + */ +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) +{ + int closid = rdtgrp->closid; + struct rdt_resource *r; + struct rdt_domain *d; + + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(d, &r->domains, list) { + if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], + rdtgrp->closid, false)) + return false; + } + } + + return true; +} + +/** + * rdtgroup_mode_write - Modify the resource group's mode + * + */ +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + enum rdtgrp_mode mode; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + rdt_last_cmd_clear(); + + mode = rdtgrp->mode; + + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || + (!strcmp(buf, "pseudo-locksetup") && + mode == RDT_MODE_PSEUDO_LOCKSETUP) || + (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) + goto out; + + if (mode == RDT_MODE_PSEUDO_LOCKED) { + rdt_last_cmd_printf("cannot change pseudo-locked group\n"); + ret = -EINVAL; + goto out; + } + + if (!strcmp(buf, "shareable")) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + } else if (!strcmp(buf, "exclusive")) { + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { + rdt_last_cmd_printf("schemata overlaps\n"); + ret = -EINVAL; + goto out; + } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_EXCLUSIVE; + } else if (!strcmp(buf, "pseudo-locksetup")) { + ret = rdtgroup_locksetup_enter(rdtgrp); + if (ret) + goto out; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; + } else { + rdt_last_cmd_printf("unknown/unsupported mode\n"); + ret = -EINVAL; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +/** + * rdtgroup_cbm_to_size - Translate CBM to size in bytes + * @r: RDT resource to which @d belongs. + * @d: RDT domain instance. + * @cbm: bitmask for which the size should be computed. + * + * The bitmask provided associated with the RDT domain instance @d will be + * translated into how many bytes it represents. The size in bytes is + * computed by first dividing the total cache size by the CBM length to + * determine how many bytes each bit in the bitmask represents. The result + * is multiplied with the number of bits set in the bitmask. + */ +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, + struct rdt_domain *d, u32 cbm) +{ + struct cpu_cacheinfo *ci; + unsigned int size = 0; + int num_b, i; + + num_b = bitmap_weight((unsigned long *)&cbm, r->cache.cbm_len); + ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == r->cache_level) { + size = ci->info_list[i].size / r->cache.cbm_len * num_b; + break; + } + } + + return size; +} + +/** + * rdtgroup_size_show - Display size in bytes of allocated regions + * + * The "size" file mirrors the layout of the "schemata" file, printing the + * size in bytes of each region instead of the capacity bitmask. + * + */ +static int rdtgroup_size_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + struct rdt_domain *d; + unsigned int size; + bool sep = false; + u32 cbm; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + goto out; + } + + for_each_alloc_enabled_rdt_resource(r) { + seq_printf(s, "%*s:", max_name_width, r->name); + list_for_each_entry(d, &r->domains, list) { + if (sep) + seq_putc(s, ';'); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + size = 0; + } else { + cbm = d->ctrl_val[rdtgrp->closid]; + size = rdtgroup_cbm_to_size(r, d, cbm); + } + seq_printf(s, "%d=%u", d->id, size); + sep = true; + } + seq_putc(s, '\n'); + } + +out: + rdtgroup_kn_unlock(of->kn); + + return 0; +} + /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { @@ -792,6 +1248,13 @@ static struct rftype res_common_files[] = { .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { + .name = "bit_usage", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bit_usage_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -853,6 +1316,22 @@ static struct rftype res_common_files[] = { .seq_show = rdtgroup_schemata_show, .fflags = RF_CTRL_BASE, }, + { + .name = "mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mode_write, + .seq_show = rdtgroup_mode_show, + .fflags = RF_CTRL_BASE, + }, + { + .name = "size", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_size_show, + .fflags = RF_CTRL_BASE, + }, + }; static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) @@ -883,6 +1362,103 @@ error: return ret; } +/** + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * The permissions of named resctrl file, directory, or link are modified + * to not allow read, write, or execute by any user. + * + * WARNING: This function is intended to communicate to the user that the + * resctrl file has been locked down - that it is not relevant to the + * particular state the system finds itself in. It should not be relied + * on to protect from user access because after the file's permissions + * are restricted the user can still change the permissions using chmod + * from the command line. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn; + int ret = 0; + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + iattr.ia_mode = S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode = S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode = S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +/** + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * @mask: Mask of permissions that should be restored + * + * Restore the permissions of the named file. If @name is a directory the + * permissions of its parent will be used. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn, *parent; + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + iattr.ia_mode = rft->mode & mask; + } + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + parent = kernfs_get_parent(kn); + if (parent) { + iattr.ia_mode |= parent->mode; + kernfs_put(parent); + } + iattr.ia_mode |= S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode |= S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode |= S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, unsigned long fflags) { @@ -1224,6 +1800,9 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); kernfs_unbreak_active_protection(kn); kernfs_put(rdtgrp->kn); kfree(rdtgrp); @@ -1289,10 +1868,16 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, rdtgroup_default.mon.mon_data_kn = kn_mondata; } + ret = rdt_pseudo_lock_init(); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_mondata; + goto out_psl; if (rdt_alloc_capable) static_branch_enable_cpuslocked(&rdt_alloc_enable_key); @@ -1310,6 +1895,8 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out; +out_psl: + rdt_pseudo_lock_release(); out_mondata: if (rdt_mon_capable) kernfs_remove(kn_mondata); @@ -1447,6 +2034,10 @@ static void rmdir_all_sub(void) if (rdtgrp == &rdtgroup_default) continue; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + /* * Give any CPUs back to the default group. We cannot copy * cpu_online_mask because a CPU might have executed the @@ -1483,6 +2074,8 @@ static void rdt_kill_sb(struct super_block *sb) reset_all_ctrls(r); cdp_disable_all(); rmdir_all_sub(); + rdt_pseudo_lock_release(); + rdtgroup_default.mode = RDT_MODE_SHAREABLE; static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -1503,7 +2096,8 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, struct kernfs_node *kn; int ret = 0; - kn = __kernfs_create_file(parent_kn, name, 0444, 0, + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, &kf_mondata_ops, priv, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -1682,6 +2276,114 @@ out_destroy: return ret; } +/** + * cbm_ensure_valid - Enforce validity on provided CBM + * @_val: Candidate CBM + * @r: RDT resource to which the CBM belongs + * + * The provided CBM represents all cache portions available for use. This + * may be represented by a bitmap that does not consist of contiguous ones + * and thus be an invalid CBM. + * Here the provided CBM is forced to be a valid CBM by only considering + * the first set of contiguous bits as valid and clearing all bits. + * The intention here is to provide a valid default CBM with which a new + * resource group is initialized. The user can follow this with a + * modification to the CBM if the default does not satisfy the + * requirements. + */ +static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) +{ + /* + * Convert the u32 _val to an unsigned long required by all the bit + * operations within this function. No more than 32 bits of this + * converted value can be accessed because all bit operations are + * additionally provided with cbm_len that is initialized during + * hardware enumeration using five bits from the EAX register and + * thus never can exceed 32 bits. + */ + unsigned long *val = (unsigned long *)_val; + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit; + + if (*val == 0) + return; + + first_bit = find_first_bit(val, cbm_len); + zero_bit = find_next_zero_bit(val, cbm_len, first_bit); + + /* Clear any remaining bits to ensure contiguous region */ + bitmap_clear(val, zero_bit, cbm_len - zero_bit); +} + +/** + * rdtgroup_init_alloc - Initialize the new RDT group's allocations + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. That is, all shareable and unused bits. + * + * All-zero CBM is invalid. If there are no more shareable bits available + * on any domain then the entire allocation will fail. + */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + u32 used_b = 0, unused_b = 0; + u32 closid = rdtgrp->closid; + struct rdt_resource *r; + enum rdtgrp_mode mode; + struct rdt_domain *d; + int i, ret; + u32 *ctrl; + + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(d, &r->domains, list) { + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < r->num_closid; i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + break; + used_b |= *ctrl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl; + } + } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cbm_ensure_valid(&d->new_ctrl, r); + if (bitmap_weight((unsigned long *) &d->new_ctrl, + r->cache.cbm_len) < + r->cache.min_cbm_bits) { + rdt_last_cmd_printf("no space on %s:%d\n", + r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + } + } + + for_each_alloc_enabled_rdt_resource(r) { + ret = update_domains(r, rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("failed to initialize allocations\n"); + return ret; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + } + + return 0; +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, struct kernfs_node *prgrp_kn, const char *name, umode_t mode, @@ -1700,6 +2402,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_unlock; } + if (rtype == RDTMON_GROUP && + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { + ret = -EINVAL; + rdt_last_cmd_puts("pseudo-locking in progress\n"); + goto out_unlock; + } + /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { @@ -1840,6 +2550,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = 0; rdtgrp->closid = closid; + ret = rdtgroup_init_alloc(rdtgrp); + if (ret < 0) + goto out_id_free; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); if (rdt_mon_capable) { @@ -1850,15 +2564,16 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_id_free; + goto out_del_list; } } goto out_unlock; +out_del_list: + list_del(&rdtgrp->rdtgroup_list); out_id_free: closid_free(closid); - list_del(&rdtgrp->rdtgroup_list); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); out_unlock: @@ -1945,6 +2660,21 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, return 0; } +static int rdtgroup_ctrl_remove(struct kernfs_node *kn, + struct rdtgroup *rdtgrp) +{ + rdtgrp->flags = RDT_DELETED; + list_del(&rdtgrp->rdtgroup_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + return 0; +} + static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { @@ -1970,7 +2700,6 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); - rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); @@ -1979,14 +2708,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, */ free_all_child_rdtgrp(rdtgrp); - list_del(&rdtgrp->rdtgroup_list); - - /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in rdtgroup_kn_unlock() - */ - kernfs_get(kn); - kernfs_remove(rdtgrp->kn); + rdtgroup_ctrl_remove(kn, rdtgrp); return 0; } @@ -2014,13 +2736,19 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) * If the rdtgroup is a mon group and parent directory * is a valid "mon_groups" directory, remove the mon group. */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); - else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, kn->name)) + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = rdtgroup_ctrl_remove(kn, rdtgrp); + } else { + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + } + } else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) { ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); - else + } else { ret = -EPERM; + } out: rdtgroup_kn_unlock(kn); @@ -2046,7 +2774,8 @@ static int __init rdtgroup_setup_root(void) int ret; rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, &rdtgroup_default); if (IS_ERR(rdt_root)) return PTR_ERR(rdt_root); @@ -2102,6 +2831,29 @@ int __init rdtgroup_init(void) if (ret) goto cleanup_mountpoint; + /* + * Adding the resctrl debugfs directory here may not be ideal since + * it would let the resctrl debugfs directory appear on the debugfs + * filesystem before the resctrl filesystem is mounted. + * It may also be ok since that would enable debugging of RDT before + * resctrl is mounted. + * The reason why the debugfs directory is created here and not in + * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * during the debugfs directory creation also &sb->s_type->i_mutex_key + * (the lockdep class of inode->i_rwsem). Other filesystem + * interactions (eg. SyS_getdents) have the lock ordering: + * &sb->s_type->i_mutex_key --> &mm->mmap_sem + * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex + * is taken, thus creating dependency: + * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause + * issues considering the other two lock dependencies. + * By creating the debugfs directory here we avoid a dependency + * that may cause deadlock (even though file operations cannot + * occur until the filesystem is mounted, but I do not know how to + * tell lockdep that). + */ + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + return 0; cleanup_mountpoint: @@ -2111,3 +2863,11 @@ cleanup_root: return ret; } + +void __exit rdtgroup_exit(void) +{ + debugfs_remove_recursive(debugfs_resctrl); + unregister_filesystem(&rdt_fs_type); + sysfs_remove_mount_point(fs_kobj, "resctrl"); + kernfs_destroy_root(rdt_root); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c102ad51025e..4b767284b7f5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -123,8 +123,8 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - /* We hope get_seconds stays lockless */ - m->time = get_seconds(); + /* need the internal __ version to avoid deadlocks */ + m->time = __ktime_get_real_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; @@ -1104,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn) } #endif + +/* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ +static bool __mc_check_crashing_cpu(int cpu) +{ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return true; + } + } + return false; +} + +static void __mc_scan_banks(struct mce *m, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, + int no_way_out, int *worst) +{ + struct mca_config *cfg = &mca_cfg; + int severity, i; + + for (i = 0; i < cfg->banks; i++) { + __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; + + if (!mce_banks[i].ctl) + continue; + + m->misc = 0; + m->addr = 0; + m->bank = i; + + m->status = mce_rdmsrl(msr_ops.status(i)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + /* + * Corrected or non-signaled errors are handled by + * machine_check_poll(). Leave them alone, unless this panics. + */ + if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* Set taint even when machine check was not enabled. */ + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + severity = mce_severity(m, cfg->tolerant, NULL, true); + + /* + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicking. + */ + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) + continue; + + __set_bit(i, toclear); + + /* Machine check event was not enabled. Clear, but ignore. */ + if (severity == MCE_NO_SEVERITY) + continue; + + mce_read_aux(m, i); + + /* assuming valid severity level != 0 */ + m->severity = severity; + + mce_log(m); + + if (severity > *worst) { + *final = *m; + *worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + *m = *final; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1118,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn) */ void do_machine_check(struct pt_regs *regs, long error_code) { + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); + DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; + int cpu = smp_processor_id(); + char *msg = "Unknown"; struct mce m, *final; - int i; int worst = 0; - int severity; /* * Establish sequential order between the CPUs entering the machine * check handler. */ int order = -1; + /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; + /* * If kill_it gets set, there might be a way to recover from this * error. */ int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - char *msg = "Unknown"; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ int lmce = 1; - int cpu = smp_processor_id(); - - /* - * Cases where we avoid rendezvous handler timeout: - * 1) If this CPU is offline. - * - * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to - * skip those CPUs which remain looping in the 1st kernel - see - * crash_nmi_callback(). - * - * Note: there still is a small window between kexec-ing and the new, - * kdump kernel establishing a new #MC handler where a broadcasted MCE - * might not get handled properly. - */ - if (cpu_is_offline(cpu) || - (crashing_cpu != -1 && crashing_cpu != cpu)) { - u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - return; - } - } + if (__mc_check_crashing_cpu(cpu)) + return; ist_enter(regs); this_cpu_inc(mce_exception_count); - if (!cfg->banks) - goto out; - mce_gather_info(&m, regs); m.tsc = rdtsc(); @@ -1220,67 +1292,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); } - for (i = 0; i < cfg->banks; i++) { - __clear_bit(i, toclear); - if (!test_bit(i, valid_banks)) - continue; - if (!mce_banks[i].ctl) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - m.status = mce_rdmsrl(msr_ops.status(i)); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - /* - * Non uncorrected or non signaled errors are handled by - * machine_check_poll. Leave them alone, unless this panics. - */ - if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && - !no_way_out) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - severity = mce_severity(&m, cfg->tolerant, NULL, true); - - /* - * When machine check was for corrected/deferred handler don't - * touch, unless we're panicing. - */ - if ((severity == MCE_KEEP_SEVERITY || - severity == MCE_UCNA_SEVERITY) && !no_way_out) - continue; - __set_bit(i, toclear); - if (severity == MCE_NO_SEVERITY) { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; - } - - mce_read_aux(&m, i); - - /* assuming valid severity level != 0 */ - m.severity = severity; - - mce_log(&m); - - if (severity > worst) { - *final = m; - worst = severity; - } - } - - /* mce_clear_state will clear *final, save locally for use later */ - m = *final; + __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1319,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (worst > 0) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); -out: + sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) @@ -2165,9 +2177,6 @@ static ssize_t store_int_with_restart(struct device *s, if (check_interval == old_check_interval) return ret; - if (check_interval < 1) - check_interval = 1; - mutex_lock(&mce_sysfs_mutex); mce_restart(); mutex_unlock(&mce_sysfs_mutex); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 08286269fd24..b9bc8a1a584e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -509,12 +509,20 @@ static struct platform_device *microcode_pdev; static int check_online_cpus(void) { - if (num_online_cpus() == num_present_cpus()) - return 0; + unsigned int cpu; - pr_err("Not all CPUs online, aborting microcode update.\n"); + /* + * Make sure all CPUs are online. It's fine for SMT to be disabled if + * all the primary threads are still online. + */ + for_each_present_cpu(cpu) { + if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) { + pr_err("Not all CPUs online, aborting microcode update.\n"); + return -EINVAL; + } + } - return -EINVAL; + return 0; } static atomic_t late_cpus_in; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 031082c96db8..ad12733f6058 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -41,7 +41,7 @@ static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); -void hyperv_vector_handler(struct pt_regs *regs) +__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -50,7 +50,7 @@ void hyperv_vector_handler(struct pt_regs *regs) if (vmbus_handler) vmbus_handler(); - if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) + if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) ack_APIC_irq(); exiting_irq(); @@ -300,7 +300,7 @@ static void __init ms_hyperv_init_platform(void) hyperv_reenlightenment_vector); /* Setup the IDT for stimer0 */ - if (ms_hyperv.misc_features & HV_X64_STIMER_DIRECT_MODE_AVAILABLE) + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) alloc_intr_gate(HYPERV_STIMER0_VECTOR, hv_stimer0_callback_vector); #endif diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 4021d3859499..40eee6cc4124 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -106,7 +106,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) memset(line, 0, LINE_SIZE); - length = strncpy_from_user(line, buf, LINE_SIZE - 1); + len = min_t(size_t, len, LINE_SIZE - 1); + length = strncpy_from_user(line, buf, len); if (length < 0) return length; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 81c0afb39d0a..71ca064e3794 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -22,18 +22,10 @@ #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -/* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -int detect_extended_topology(struct cpuinfo_x86 *c) +int detect_extended_topology_early(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - static bool printed; + unsigned int eax, ebx, ecx, edx; if (c->cpuid_level < 0xb) return -1; @@ -52,10 +44,30 @@ int detect_extended_topology(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; + smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); +#endif + return 0; +} + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +int detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + + if (detect_extended_topology_early(c) < 0) + return -1; /* * Populate HT related information from sub-leaf level 0. */ + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); @@ -86,15 +98,6 @@ int detect_extended_topology(struct cpuinfo_x86 *c) c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - if (!printed) { - pr_info("CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - pr_info("CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } #endif return 0; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 666a284116ac..9c8652974f8e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,8 +22,6 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -#define OPCODE_BUFSIZE 64 - int panic_on_unrecovered_nmi; int panic_on_io_nmi; static int die_counter; @@ -93,26 +91,18 @@ static void printk_stack_address(unsigned long address, int reliable, */ void show_opcodes(u8 *rip, const char *loglvl) { - unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3; +#define PROLOGUE_SIZE 42 +#define EPILOGUE_SIZE 21 +#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) u8 opcodes[OPCODE_BUFSIZE]; - u8 *ip; - int i; - - printk("%sCode: ", loglvl); - - ip = (u8 *)rip - code_prologue; - if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) { - pr_cont("Bad RIP value.\n"); - return; - } - for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) { - if (ip == rip) - pr_cont("<%02x> ", opcodes[i]); - else - pr_cont("%02x ", opcodes[i]); + if (probe_kernel_read(opcodes, rip - PROLOGUE_SIZE, OPCODE_BUFSIZE)) { + printk("%sCode: Bad RIP value.\n", loglvl); + } else { + printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" + __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes, + opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1); } - pr_cont("\n"); } void show_ip(struct pt_regs *regs, const char *loglvl) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index da5d8ac60062..50d5848bf22e 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -338,6 +338,18 @@ static resource_size_t __init gen3_stolen_base(int num, int slot, int func, return bsm & INTEL_BSM_MASK; } +static resource_size_t __init gen11_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u64 bsm; + + bsm = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0); + bsm &= INTEL_BSM_MASK; + bsm |= (u64)read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1) << 32; + + return bsm; +} + static resource_size_t __init i830_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; @@ -498,6 +510,11 @@ static const struct intel_early_ops chv_early_ops __initconst = { .stolen_size = chv_stolen_size, }; +static const struct intel_early_ops gen11_early_ops __initconst = { + .stolen_base = gen11_stolen_base, + .stolen_size = gen9_stolen_size, +}; + static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I830_IDS(&i830_early_ops), INTEL_I845G_IDS(&i845_early_ops), @@ -529,6 +546,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_CFL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), INTEL_CNL_IDS(&gen9_early_ops), + INTEL_ICL_11_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index f92a6593de1e..2ea85b32421a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -10,6 +10,7 @@ #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> +#include <asm/irq_regs.h> #include <linux/hardirq.h> #include <linux/pkeys.h> diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index abe6df15a8fb..30f9cb2c0b55 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -512,11 +512,18 @@ ENTRY(initial_code) ENTRY(setup_once_ref) .long setup_once +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#define PGD_ALIGN (2 * PAGE_SIZE) +#define PTI_USER_PGD_FILL 1024 +#else +#define PGD_ALIGN (PAGE_SIZE) +#define PTI_USER_PGD_FILL 0 +#endif /* * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE + .align PGD_ALIGN #ifdef CONFIG_X86_PAE .globl initial_pg_pmd initial_pg_pmd: @@ -526,14 +533,17 @@ initial_pg_pmd: initial_page_table: .fill 1024,4,0 #endif + .align PGD_ALIGN initial_pg_fixmap: .fill 1024,4,0 -.globl empty_zero_page -empty_zero_page: - .fill 4096,1,0 .globl swapper_pg_dir + .align PGD_ALIGN swapper_pg_dir: .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 +.globl empty_zero_page +empty_zero_page: + .fill 4096,1,0 EXPORT_SYMBOL(empty_zero_page) /* @@ -542,7 +552,7 @@ EXPORT_SYMBOL(empty_zero_page) #ifdef CONFIG_X86_PAE __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ - .align PAGE_SIZE + .align PGD_ALIGN ENTRY(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 8344dd2f310a..15ebc2fc166e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -235,7 +235,7 @@ ENTRY(secondary_startup_64) * address given in m16:64. */ pushq $.Lafter_lret # put return address on stack for unwinder - xorq %rbp, %rbp # clear frame pointer + xorl %ebp, %ebp # clear frame pointer movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 346b24883911..b0acb22e5a46 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,6 +1,7 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/export.h> #include <linux/delay.h> #include <linux/errno.h> diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 8771766d46b6..34a5c1715148 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -169,28 +169,29 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) set_dr_addr_mask(0, i); } -/* - * Check for virtual address in kernel space. - */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +static int arch_bp_generic_len(int x86_len) { - unsigned int len; - unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - - va = info->address; - len = bp->attr.bp_len; - - /* - * We don't need to worry about va + len - 1 overflowing: - * we already require that va is aligned to a multiple of len. - */ - return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + return HW_BREAKPOINT_LEN_1; + case X86_BREAKPOINT_LEN_2: + return HW_BREAKPOINT_LEN_2; + case X86_BREAKPOINT_LEN_4: + return HW_BREAKPOINT_LEN_4; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + return HW_BREAKPOINT_LEN_8; +#endif + default: + return -EINVAL; + } } int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { + int len; + /* Type */ switch (x86_type) { case X86_BREAKPOINT_EXECUTE: @@ -211,42 +212,47 @@ int arch_bp_generic_fields(int x86_len, int x86_type, } /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_1: - *gen_len = HW_BREAKPOINT_LEN_1; - break; - case X86_BREAKPOINT_LEN_2: - *gen_len = HW_BREAKPOINT_LEN_2; - break; - case X86_BREAKPOINT_LEN_4: - *gen_len = HW_BREAKPOINT_LEN_4; - break; -#ifdef CONFIG_X86_64 - case X86_BREAKPOINT_LEN_8: - *gen_len = HW_BREAKPOINT_LEN_8; - break; -#endif - default: + len = arch_bp_generic_len(x86_len); + if (len < 0) return -EINVAL; - } + *gen_len = len; return 0; } - -static int arch_build_bp_info(struct perf_event *bp) +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long va; + int len; - info->address = bp->attr.bp_addr; + va = hw->address; + len = arch_bp_generic_len(hw->len); + WARN_ON_ONCE(len < 0); + + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); +} + +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + hw->address = attr->bp_addr; + hw->mask = 0; /* Type */ - switch (bp->attr.bp_type) { + switch (attr->bp_type) { case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; + hw->type = X86_BREAKPOINT_WRITE; break; case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; + hw->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: /* @@ -254,23 +260,23 @@ static int arch_build_bp_info(struct perf_event *bp) * acceptable for kprobes. On non-kprobes kernels, we don't * allow kernel breakpoints at all. */ - if (bp->attr.bp_addr >= TASK_SIZE_MAX) { + if (attr->bp_addr >= TASK_SIZE_MAX) { #ifdef CONFIG_KPROBES - if (within_kprobe_blacklist(bp->attr.bp_addr)) + if (within_kprobe_blacklist(attr->bp_addr)) return -EINVAL; #else return -EINVAL; #endif } - info->type = X86_BREAKPOINT_EXECUTE; + hw->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. * But we still need to check userspace is not trying to setup * an unsupported length, to get a range breakpoint for example. */ - if (bp->attr.bp_len == sizeof(long)) { - info->len = X86_BREAKPOINT_LEN_X; + if (attr->bp_len == sizeof(long)) { + hw->len = X86_BREAKPOINT_LEN_X; return 0; } default: @@ -278,28 +284,26 @@ static int arch_build_bp_info(struct perf_event *bp) } /* Len */ - info->mask = 0; - - switch (bp->attr.bp_len) { + switch (attr->bp_len) { case HW_BREAKPOINT_LEN_1: - info->len = X86_BREAKPOINT_LEN_1; + hw->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - info->len = X86_BREAKPOINT_LEN_2; + hw->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: - info->len = X86_BREAKPOINT_LEN_4; + hw->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: - info->len = X86_BREAKPOINT_LEN_8; + hw->len = X86_BREAKPOINT_LEN_8; break; #endif default: /* AMD range breakpoint */ - if (!is_power_of_2(bp->attr.bp_len)) + if (!is_power_of_2(attr->bp_len)) return -EINVAL; - if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) + if (attr->bp_addr & (attr->bp_len - 1)) return -EINVAL; if (!boot_cpu_has(X86_FEATURE_BPEXT)) @@ -312,8 +316,8 @@ static int arch_build_bp_info(struct perf_event *bp) * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ - info->mask = bp->attr.bp_len - 1; - info->len = X86_BREAKPOINT_LEN_1; + hw->mask = attr->bp_len - 1; + hw->len = X86_BREAKPOINT_LEN_1; } return 0; @@ -322,22 +326,23 @@ static int arch_build_bp_info(struct perf_event *bp) /* * Validate the arch-specific HW Breakpoint register settings */ -int arch_validate_hwbkpt_settings(struct perf_event *bp) +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned int align; int ret; - ret = arch_build_bp_info(bp); + ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; - switch (info->len) { + switch (hw->len) { case X86_BREAKPOINT_LEN_1: align = 0; - if (info->mask) - align = info->mask; + if (hw->mask) + align = hw->mask; break; case X86_BREAKPOINT_LEN_2: align = 1; @@ -358,7 +363,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ - if (info->address & align) + if (hw->address & align) return -EINVAL; return 0; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 86c4439f9d74..519649ddf100 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/timex.h> #include <linux/random.h> #include <linux/init.h> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 74383a3780dc..01adea278a71 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -8,6 +8,7 @@ #include <asm/traps.h> #include <asm/proto.h> #include <asm/desc.h> +#include <asm/hw_irq.h> struct idt_data { unsigned int vector; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 328d027d829d..59b5f2ea7c2f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -10,6 +10,7 @@ #include <linux/ftrace.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/irq.h> #include <asm/apic.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index c1bdbd3d3232..95600a99ae93 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -11,6 +11,7 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/kernel_stat.h> #include <linux/notifier.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index d86e344f5b3d..0469cd078db1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -11,6 +11,7 @@ #include <linux/kernel_stat.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/seq_file.h> #include <linux/delay.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S new file mode 100644 index 000000000000..ddeeaac8adda --- /dev/null +++ b/arch/x86/kernel/irqflags.S @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/asm.h> +#include <asm/export.h> +#include <linux/linkage.h> + +/* + * unsigned long native_save_fl(void) + */ +ENTRY(native_save_fl) + pushf + pop %_ASM_AX + ret +ENDPROC(native_save_fl) +EXPORT_SYMBOL(native_save_fl) + +/* + * void native_restore_fl(unsigned long flags) + * %eax/%rdi: flags + */ +ENTRY(native_restore_fl) + push %_ASM_ARG1 + popf + ret +ENDPROC(native_restore_fl) +EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 772196c1b8c4..a0693b71cfc1 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/timex.h> #include <linux/random.h> #include <linux/kprobes.h> diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e56c95be2808..eeea935e9bb5 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,15 +37,18 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), - int init) +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + if (early_boot_irqs_disabled) + poker = text_poke_early; + if (type == JUMP_LABEL_JMP) { if (init) { /* diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 7326078eaa7a..278cd07228dd 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -532,7 +532,7 @@ static int bzImage64_cleanup(void *loader_data) static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) { return verify_pefile_signature(kernel, kernel_len, - NULL, + VERIFY_USE_SECONDARY_KEYRING, VERIFYING_KEXEC_PE_SIGNATURE); } #endif diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index ae38dccf0c8f..2b949f4fd4d8 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -105,14 +105,4 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig } #endif -#ifdef CONFIG_KPROBES_ON_FTRACE -extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb); -#else -static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - return 0; -} -#endif #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6f4d42377fe5..b0d1e81c96bb 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -66,8 +66,6 @@ #include "common.h" -void jprobe_return_end(void); - DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -395,8 +393,6 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) - (u8 *) real; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", - src, real, insn->displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(insn); @@ -596,7 +592,6 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, * stepping. */ regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); return; } #endif @@ -640,8 +635,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, * Raise a BUG or we'll continue in an endless reentering loop * and eventually a stack overflow. */ - printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", - p->addr); + pr_err("Unrecoverable kprobe detected.\n"); dump_kprobe(p); BUG(); default: @@ -669,12 +663,10 @@ int kprobe_int3_handler(struct pt_regs *regs) addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); /* - * We don't want to be preempted for the entire - * duration of kprobe processing. We conditionally - * re-enable preemption at the end of this function, - * and also in reenter_kprobe() and setup_singlestep(). + * We don't want to be preempted for the entire duration of kprobe + * processing. Since int3 and debug trap disables irqs and we clear + * IF while singlestepping, it must be no preemptible. */ - preempt_disable(); kcb = get_kprobe_ctlblk(); p = get_kprobe(addr); @@ -690,13 +682,14 @@ int kprobe_int3_handler(struct pt_regs *regs) /* * If we have no pre-handler or it returned 0, we * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it prepped - * for calling the break_handler below on re-entry - * for jprobe processing, so get out doing nothing - * more here. + * pre-handler and it returned non-zero, that means + * user handler setup registers to exit to another + * instruction, we must skip the single stepping. */ if (!p->pre_handler || !p->pre_handler(p, regs)) setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); return 1; } } else if (*addr != BREAKPOINT_INSTRUCTION) { @@ -710,18 +703,9 @@ int kprobe_int3_handler(struct pt_regs *regs) * the original instruction. */ regs->ip = (unsigned long)addr; - preempt_enable_no_resched(); return 1; - } else if (kprobe_running()) { - p = __this_cpu_read(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - if (!skip_singlestep(p, regs, kcb)) - setup_singlestep(p, regs, kcb, 0); - return 1; - } } /* else: not a kprobe fault; let the kernel handle it */ - preempt_enable_no_resched(); return 0; } NOKPROBE_SYMBOL(kprobe_int3_handler); @@ -972,8 +956,6 @@ int kprobe_debug_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); - /* * if somebody else is singlestepping across a probe point, flags * will have TF set, in which case, continue the remaining processing @@ -1020,7 +1002,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* @@ -1083,93 +1064,6 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_sp = stack_addr(regs); - addr = (unsigned long)(kcb->jprobe_saved_sp); - - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - * Use __memcpy() to avoid KASAN stack out-of-bounds reports as we copy - * raw stack chunk with redzones: - */ - __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->ip = (unsigned long)(jp->entry); - - /* - * jprobes use jprobe_return() which skips the normal return - * path of the function, and this messes up the accounting of the - * function graph tracer to get messed up. - * - * Pause function graph tracing while performing the jprobe function. - */ - pause_graph_tracing(); - return 1; -} -NOKPROBE_SYMBOL(setjmp_pre_handler); - -void jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - /* Unpoison stack redzones in the frames we are going to jump over. */ - kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp); - - asm volatile ( -#ifdef CONFIG_X86_64 - " xchg %%rbx,%%rsp \n" -#else - " xchgl %%ebx,%%esp \n" -#endif - " int3 \n" - " .globl jprobe_return_end\n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_sp):"memory"); -} -NOKPROBE_SYMBOL(jprobe_return); -NOKPROBE_SYMBOL(jprobe_return_end); - -int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->ip - 1); - struct jprobe *jp = container_of(p, struct jprobe, kp); - void *saved_sp = kcb->jprobe_saved_sp; - - if ((addr > (u8 *) jprobe_return) && - (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != saved_sp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk(KERN_ERR - "current sp %p does not match saved sp %p\n", - stack_addr(regs), saved_sp); - printk(KERN_ERR "Saved registers for jprobe %p\n", jp); - show_regs(saved_regs); - printk(KERN_ERR "Current registers\n"); - show_regs(regs); - BUG(); - } - /* It's OK to start function graph tracing again */ - unpause_graph_tracing(); - *regs = kcb->jprobe_saved_regs; - __memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(longjmp_break_handler); - bool arch_within_kprobe_blacklist(unsigned long addr) { bool is_in_entry_trampoline_section = false; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 8dc0161cec8f..ef819e19650b 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,36 +25,6 @@ #include "common.h" -static nokprobe_inline -void __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb, unsigned long orig_ip) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); - if (orig_ip) - regs->ip = orig_ip; -} - -int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - if (kprobe_ftrace(p)) { - __skip_singlestep(p, regs, kcb, 0); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(skip_singlestep); - /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) @@ -75,18 +45,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); - /* To emulate trap based kprobes, preempt_disable here */ - preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { - __skip_singlestep(p, regs, kcb, orig_ip); - preempt_enable_no_resched(); + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + regs->ip = orig_ip; } /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe, and keep preempt count +1. + * If pre_handler returns !0, it changes regs->ip. We have to + * skip emulating post_handler. */ + __this_cpu_write(current_kprobe, NULL); } } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 203d398802a3..eaf02f2e7300 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -491,7 +491,6 @@ int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; if (!reenter) reset_current_kprobe(); - preempt_enable_no_resched(); return 1; } return 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..0f471bd93417 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,7 +45,6 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> -#include <asm/kvm_guest.h> static int kvmapf = 1; @@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -static int kvmclock_vsyscall = 1; -static int __init parse_no_kvmclock_vsyscall(char *arg) -{ - kvmclock_vsyscall = 0; - return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -154,7 +144,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) for (;;) { if (!n.halted) - prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -188,7 +178,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) if (n->halted) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) - swake_up(&n->wq); + swake_up_one(&n->wq); } static void apf_task_wake_all(void) @@ -454,6 +444,98 @@ static void __init sev_map_percpu_data(void) } #ifdef CONFIG_SMP +#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) + +static void __send_ipi_mask(const struct cpumask *mask, int vector) +{ + unsigned long flags; + int cpu, apic_id, icr; + int min = 0, max = 0; +#ifdef CONFIG_X86_64 + __uint128_t ipi_bitmap = 0; +#else + u64 ipi_bitmap = 0; +#endif + + if (cpumask_empty(mask)) + return; + + local_irq_save(flags); + + switch (vector) { + default: + icr = APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr = APIC_DM_NMI; + break; + } + + for_each_cpu(cpu, mask) { + apic_id = per_cpu(x86_cpu_to_apicid, cpu); + if (!ipi_bitmap) { + min = max = apic_id; + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { + ipi_bitmap <<= min - apic_id; + min = apic_id; + } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + max = apic_id < max ? max : apic_id; + } else { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + min = max = apic_id; + ipi_bitmap = 0; + } + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); + } + + if (ipi_bitmap) { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + } + + local_irq_restore(flags); +} + +static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) +{ + __send_ipi_mask(mask, vector); +} + +static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask new_mask; + const struct cpumask *local_mask; + + cpumask_copy(&new_mask, mask); + cpumask_clear_cpu(this_cpu, &new_mask); + local_mask = &new_mask; + __send_ipi_mask(local_mask, vector); +} + +static void kvm_send_ipi_allbutself(int vector) +{ + kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); +} + +static void kvm_send_ipi_all(int vector) +{ + __send_ipi_mask(cpu_online_mask, vector); +} + +/* + * Set the IPI entry points + */ +static void kvm_setup_pv_ipi(void) +{ + apic->send_IPI_mask = kvm_send_ipi_mask; + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic->send_IPI_allbutself = kvm_send_ipi_allbutself; + apic->send_IPI_all = kvm_send_ipi_all; + pr_info("KVM setup pv IPIs\n"); +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -560,9 +642,6 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; @@ -624,12 +703,27 @@ static uint32_t __init kvm_detect(void) return kvm_cpuid_base(); } +static void __init kvm_apic_init(void) +{ +#if defined(CONFIG_SMP) + if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + kvm_setup_pv_ipi(); +#endif +} + +static void __init kvm_init_platform(void) +{ + kvmclock_init(); + x86_platform.apic_post_init = kvm_apic_init; +} + const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, }; static __init int activate_jump_labels(void) @@ -748,6 +842,10 @@ void __init kvm_spinlock_init(void) if (kvm_para_has_hint(KVM_HINTS_REALTIME)) return; + /* Don't use the pvqspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + return; + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index bf8d1eb7fca3..1e6764648af3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,30 +23,57 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/memblock.h> +#include <linux/cpuhotplug.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <asm/hypervisor.h> #include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/kvmclock.h> -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static u64 kvm_sched_clock_offset; +static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; +static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static u64 kvm_sched_clock_offset __ro_after_init; -static int parse_no_kvmclock(char *arg) +static int __init parse_no_kvmclock(char *arg) { kvmclock = 0; return 0; } early_param("no-kvmclock", parse_no_kvmclock); -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock *wall_clock; +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) + +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock; +static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} /* * The wallclock is the time of day when we booted. Since then, some time may @@ -55,21 +82,10 @@ static struct pvclock_wall_clock *wall_clock; */ static void kvm_get_wallclock(struct timespec64 *now) { - struct pvclock_vcpu_time_info *vcpu_time; - int low, high; - int cpu; - - low = (int)slow_virt_to_phys(wall_clock); - high = ((u64)slow_virt_to_phys(wall_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(wall_clock, vcpu_time, now); - - put_cpu(); + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); } static int kvm_set_wallclock(const struct timespec64 *now) @@ -79,14 +95,10 @@ static int kvm_set_wallclock(const struct timespec64 *now) static u64 kvm_clock_read(void) { - struct pvclock_vcpu_time_info *src; u64 ret; - int cpu; preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - ret = pvclock_clocksource_read(src); + ret = pvclock_clocksource_read(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -112,11 +124,11 @@ static inline void kvm_sched_clock_init(bool stable) kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > - sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); } /* @@ -130,18 +142,11 @@ static inline void kvm_sched_clock_init(bool stable) */ static unsigned long kvm_get_tsc_khz(void) { - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = &hv_clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); - return tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + return pvclock_tsc_khz(this_cpu_pvti()); } -static void kvm_get_preset_lpj(void) +static void __init kvm_get_preset_lpj(void) { unsigned long khz; u64 lpj; @@ -155,49 +160,40 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); bool ret = false; - struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); - if (!hv_clock) + if (!src) return ret; - src = &hv_clock[cpu].pvti; - if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - src->flags &= ~PVCLOCK_GUEST_STOPPED; + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } - return ret; } struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; EXPORT_SYMBOL_GPL(kvm_clock); -int kvm_register_clock(char *txt) +static void kvm_register_clock(char *txt) { - int cpu = smp_processor_id(); - int low, high, ret; - struct pvclock_vcpu_time_info *src; - - if (!hv_clock) - return 0; + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + u64 pa; - src = &hv_clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); + if (!src) + return; - return ret; + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -212,11 +208,7 @@ static void kvm_restore_sched_clock_state(void) #ifdef CONFIG_X86_LOCAL_APIC static void kvm_setup_secondary_clock(void) { - /* - * Now that the first cpu already had this clocksource initialized, - * we shouldn't fail. - */ - WARN_ON(kvm_register_clock("secondary cpu clock")); + kvm_register_clock("secondary cpu clock"); } #endif @@ -244,98 +236,84 @@ static void kvm_shutdown(void) native_machine_shutdown(); } -static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, - phys_addr_t align) +static int __init kvm_setup_vsyscall_timeinfo(void) { - phys_addr_t mem; +#ifdef CONFIG_X86_64 + u8 flags; - mem = memblock_alloc(size, align); - if (!mem) + if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) return 0; - if (sev_active()) { - if (early_set_memory_decrypted((unsigned long)__va(mem), size)) - goto e_free; - } + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - return mem; -e_free: - memblock_free(mem, size); + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif return 0; } +early_initcall(kvm_setup_vsyscall_timeinfo); -static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +static int kvmclock_setup_percpu(unsigned int cpu) { - if (sev_active()) - early_set_memory_encrypted((unsigned long)__va(addr), size); + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); + + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. + */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else + p = kzalloc(sizeof(*p), GFP_KERNEL); - memblock_free(addr, size); + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 0 : -ENOMEM; } void __init kvmclock_init(void) { - struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem, mem_wall_clock; - int size, cpu, wall_clock_size; u8 flags; - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - if (!kvm_para_available()) + if (!kvm_para_available() || !kvmclock) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) - return; - - wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); - mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); - if (!mem_wall_clock) - return; - - wall_clock = __va(mem_wall_clock); - memset(wall_clock, 0, wall_clock_size); - - mem = kvm_memblock_alloc(size, PAGE_SIZE); - if (!mem) { - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { return; } - hv_clock = __va(mem); - memset(hv_clock, 0, size); - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - kvm_memblock_free(mem, size); - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { return; } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + pr_info("kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); + kvm_register_clock("primary cpu clock"); + pvclock_set_pvti_cpu0_va(hv_clock_boot); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); - put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.early_percpu_clock_init = - kvm_setup_secondary_clock; + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; @@ -347,34 +325,3 @@ void __init kvmclock_init(void) clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - int cpu; - u8 flags; - struct pvclock_vcpu_time_info *vcpu_time; - unsigned int size; - - if (!hv_clock) - return 0; - - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - put_cpu(); - return 1; - } - - pvclock_set_pvti_cpu0_va(hv_clock); - put_cpu(); - - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif - return 0; -} diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c9b14020f4dd..733e6ace0fa4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -100,6 +100,102 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return new_ldt; } +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +static void do_sanity_check(struct mm_struct *mm, + bool had_kernel_mapping, + bool had_user_mapping) +{ + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!had_user_mapping); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(had_user_mapping); + } +} + +#ifdef CONFIG_X86_PAE + +static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) +{ + p4d_t *p4d; + pud_t *pud; + + if (pgd->pgd == 0) + return NULL; + + p4d = p4d_offset(pgd, va); + if (p4d_none(*p4d)) + return NULL; + + pud = pud_offset(p4d, va); + if (pud_none(*pud)) + return NULL; + + return pmd_offset(pud, va); +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pmd(u_pmd, *k_pmd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + bool had_kernel, had_user; + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + had_kernel = (k_pmd->pmd != 0); + had_user = (u_pmd->pmd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#else /* !CONFIG_X86_PAE */ + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pgd(kernel_to_user_pgdp(pgd), *pgd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + bool had_kernel = (pgd->pgd != 0); + bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#endif /* CONFIG_X86_PAE */ + /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. @@ -115,9 +211,8 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - bool is_vmalloc, had_top_level_entry; unsigned long va; + bool is_vmalloc; spinlock_t *ptl; pgd_t *pgd; int i; @@ -131,13 +226,15 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) */ WARN_ON(ldt->slot != -1); + /* Check if the current mappings are sane */ + sanity_check_ldt_mapping(mm); + /* * Did we already have the top level entry allocated? We can't * use pgd_none() for this because it doens't do anything on * 4-level page table kernels. */ pgd = pgd_offset(mm, LDT_BASE_ADDR); - had_top_level_entry = (pgd->pgd != 0); is_vmalloc = is_vmalloc_addr(ldt->entries); @@ -172,41 +269,31 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) pte_unmap_unlock(ptep, ptl); } - if (mm->context.ldt) { - /* - * We already had an LDT. The top-level entry should already - * have been allocated and synchronized with the usermode - * tables. - */ - WARN_ON(!had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) - WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); - } else { - /* - * This is the first time we're mapping an LDT for this process. - * Sync the pgd to the usermode tables. - */ - WARN_ON(had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) { - WARN_ON(kernel_to_user_pgdp(pgd)->pgd); - set_pgd(kernel_to_user_pgdp(pgd), *pgd); - } - } + /* Propagate LDT mapping to the user page-table */ + map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); ldt->slot = slot; -#endif return 0; } +#else /* !CONFIG_PAGE_TABLE_ISOLATION */ + +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + return 0; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + static void free_ldt_pgtables(struct mm_struct *mm) { #ifdef CONFIG_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; - unsigned long end = start + (1UL << PGDIR_SHIFT); + unsigned long end = LDT_END_ADDR; if (!static_cpu_has(X86_FEATURE_PTI)) return; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d1ab07ec8c9a..5409c2800ab5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -56,7 +56,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_page((unsigned long)image->arch.pgd); + free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -72,7 +72,8 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { - image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + PGD_ALLOCATION_ORDER); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 99dc79e76bdc..930c88341e4e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -88,10 +88,12 @@ unsigned paravirt_patch_call(void *insnbuf, struct branch *b = insnbuf; unsigned long delta = (unsigned long)target - (addr+5); - if (tgt_clobbers & ~site_clobbers) - return len; /* target would clobber too much for this site */ - if (len < 5) + if (len < 5) { +#ifdef CONFIG_RETPOLINE + WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); +#endif return len; /* call too long for patch site */ + } b->opcode = 0xe8; /* call */ b->delta = delta; @@ -106,8 +108,12 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, struct branch *b = insnbuf; unsigned long delta = (unsigned long)target - (addr+5); - if (len < 5) + if (len < 5) { +#ifdef CONFIG_RETPOLINE + WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); +#endif return len; /* call too long for patch site */ + } b->opcode = 0xe9; /* jmp */ b->delta = delta; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 9edadabf04f6..9cb98f7b07c9 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); -DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ab5d9dd668d2..acfd04121da3 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -155,9 +155,6 @@ static int __init pci_iommu_init(void) { struct iommu_table_entry *p; -#ifdef CONFIG_PCI - dma_debug_add_bus(&pci_bus_type); -#endif x86_init.iommu.iommu_init(); for (p = __iommu_table; p < __iommu_table_end; p++) { @@ -175,7 +172,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.dma_32bit_limit = true; + pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); return 0; } diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 4dfd90a75e63..2e9006c1e240 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -60,7 +60,7 @@ void __init check_iommu_entries(struct iommu_table_entry *start, printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", p->detect, q->detect); /* Heavy handed way..*/ - x->depend = 0; + x->depend = NULL; } } diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index da5190a1ea16..4a710ffffd9a 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -9,6 +9,6 @@ static __init int add_pcspkr(void) pd = platform_device_register_simple("pcspkr", -1, NULL, 0); - return IS_ERR(pd) ? PTR_ERR(pd) : 0; + return PTR_ERR_OR_ZERO(pd); } device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 30ca2d1a9231..c93fcfdf1673 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,14 +57,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, -#ifdef CONFIG_X86_64 /* * .sp1 is cpu_current_top_of_stack. The init task never * runs user code, but cpu_current_top_of_stack should still * be well defined before the first context switch. */ .sp1 = TOP_OF_INIT_STACK, -#endif #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0ae659de21eb..2924fd447e61 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -285,7 +285,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - update_sp0(next_p); + update_task_stack(next_p); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 12bb445fb98d..476e3ddf8890 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -478,7 +478,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ - update_sp0(next_p); + update_task_stack(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2f86d883dd95..b4866badb235 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -823,6 +823,12 @@ void __init setup_arch(char **cmdline_p) memblock_reserve(__pa_symbol(_text), (unsigned long)__bss_stop - (unsigned long)_text); + /* + * Make sure page 0 is always reserved because on systems with + * L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, PAGE_SIZE); + early_reserve_initrd(); /* @@ -866,6 +872,8 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); + arch_init_ideal_nops(); + jump_label_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -991,11 +999,6 @@ void __init setup_arch(char **cmdline_p) setup_clear_cpu_cap(X86_FEATURE_APIC); } -#ifdef CONFIG_PCI - if (pci_early_dump_regs) - early_dump_pci_devices(); -#endif - e820__reserve_setup_data(); e820__finish_early_params(); @@ -1012,6 +1015,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + tsc_early_init(); x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1197,11 +1201,6 @@ void __init setup_arch(char **cmdline_p) memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_GUEST - kvmclock_init(); -#endif - - tsc_early_delay_calibrate(); if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); @@ -1272,8 +1271,6 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - arch_init_ideal_nops(); - register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 5c574dff4c1a..04adc8d60aed 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -261,6 +261,7 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + kvm_set_cpu_l1tf_flush_l1d(); if (trace_resched_ipi_enabled()) { /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2f7d1d2a5c3..f02ecaf97904 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -80,6 +80,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> +#include <asm/hw_irq.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -221,6 +222,11 @@ static void notrace start_secondary(void *unused) #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); + /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); __flush_tlb_all(); #endif load_current_idt(); @@ -266,6 +272,23 @@ static void notrace start_secondary(void *unused) } /** + * topology_is_primary_thread - Check whether CPU is the primary SMT thread + * @cpu: CPU to check + */ +bool topology_is_primary_thread(unsigned int cpu) +{ + return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); +} + +/** + * topology_smt_supported - Check whether SMT is supported by the CPUs + */ +bool topology_smt_supported(void) +{ + return smp_num_siblings > 1; +} + +/** * topology_phys_to_logical_pkg - Map a physical package id to a logical * * Returns logical package id or -1 if not found diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 093f2ea5dd56..7627455047c2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -81,16 +81,6 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE -#define STACKTRACE_DUMP_ONCE(task) ({ \ - static bool __section(.data.unlikely) __dumped; \ - \ - if (!__dumped) { \ - __dumped = true; \ - WARN_ON(1); \ - show_stack(task, NULL); \ - } \ -}) - static int __always_inline __save_stack_trace_reliable(struct stack_trace *trace, struct task_struct *task) @@ -99,30 +89,25 @@ __save_stack_trace_reliable(struct stack_trace *trace, struct pt_regs *regs; unsigned long addr; - for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + for (unwind_start(&state, task, NULL, NULL); + !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { regs = unwind_get_entry_regs(&state, NULL); if (regs) { + /* Success path for user tasks */ + if (user_mode(regs)) + goto success; + /* * Kernel mode registers on the stack indicate an * in-kernel interrupt or exception (e.g., preemption * or a page fault), which can make frame pointers * unreliable. */ - if (!user_mode(regs)) - return -EINVAL; - /* - * The last frame contains the user mode syscall - * pt_regs. Skip it and finish the unwind. - */ - unwind_next_frame(&state); - if (!unwind_done(&state)) { - STACKTRACE_DUMP_ONCE(task); + if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; - } - break; } addr = unwind_get_return_address(&state); @@ -132,21 +117,22 @@ __save_stack_trace_reliable(struct stack_trace *trace, * generated code which __kernel_text_address() doesn't know * about. */ - if (!addr) { - STACKTRACE_DUMP_ONCE(task); + if (!addr) return -EINVAL; - } if (save_stack_address(trace, addr, false)) return -EINVAL; } /* Check for stack corruption */ - if (unwind_error(&state)) { - STACKTRACE_DUMP_ONCE(task); + if (unwind_error(&state)) + return -EINVAL; + + /* Success path for non-user tasks, i.e. kthreads and idle tasks */ + if (!(task->flags & (PF_KTHREAD | PF_IDLE))) return -EINVAL; - } +success: if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 774ebafa97c4..be01328eb755 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -12,6 +12,7 @@ #include <linux/clockchips.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/i8253.h> #include <linux/time.h> #include <linux/export.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 74392d9d51e0..1463468ba9a0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -33,16 +33,13 @@ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); +#define KHZ 1000 + /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ -static int __read_mostly tsc_disabled = -1; - static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -106,23 +103,6 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static void cyc2ns_data_init(struct cyc2ns_data *data) -{ - data->cyc2ns_mul = 0; - data->cyc2ns_shift = 0; - data->cyc2ns_offset = 0; -} - -static void __init cyc2ns_init(int cpu) -{ - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - cyc2ns_data_init(&c2n->data[0]); - cyc2ns_data_init(&c2n->data[1]); - - seqcount_init(&c2n->seq); -} - static inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; @@ -138,18 +118,11 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; - unsigned long flags; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - if (!khz) - goto done; ns_now = cycles_2_ns(tsc_now); @@ -181,13 +154,56 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_ c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); -done: sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + +/* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) @@ -248,8 +264,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); - tsc_disabled = 1; + mark_tsc_unstable("boot parameter notsc"); return 1; } #else @@ -665,30 +680,17 @@ static unsigned long cpu_khz_from_cpuid(void) return eax_base_mhz * 1000; } -/** - * native_calibrate_cpu - calibrate the cpu on boot +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. */ -unsigned long native_calibrate_cpu(void) +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; - fast_calibrate = cpu_khz_from_cpuid(); - if (fast_calibrate) - return fast_calibrate; - - fast_calibrate = cpu_khz_from_msr(); - if (fast_calibrate) - return fast_calibrate; - - local_irq_save(flags); - fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); - if (fast_calibrate) - return fast_calibrate; - /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes @@ -831,6 +833,37 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +static unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP @@ -1307,7 +1340,7 @@ unreg: static int __init init_tsc_clocksource(void) { - if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) @@ -1341,40 +1374,22 @@ unreg: */ device_initcall(init_tsc_clocksource); -void __init tsc_early_delay_calibrate(void) +static bool __init determine_cpu_tsc_frequencies(bool early) { - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? : cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - -void __init tsc_init(void) -{ - u64 lpj, cyc; - int cpu; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); + + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - /* - * Trust non-zero tsc_khz as authorative, + * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ @@ -1383,52 +1398,78 @@ void __init tsc_init(void) else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; - if (!tsc_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + if (tsc_khz == 0) + return false; pr_info("Detected %lu.%03lu MHz processor\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", - (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + unsigned long lpj = tsc_khz * KHZ; + do_div(lpj, HZ); + return lpj; +} + +static void __init tsc_enable_sched_clock(void) +{ /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} + +void __init tsc_early_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + if (!determine_cpu_tsc_frequencies(true)) + return; + loops_per_jiffy = get_loops_per_jiffy(); + tsc_enable_sched_clock(); +} + +void __init tsc_init(void) +{ /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. */ - cyc = rdtsc(); - for_each_possible_cpu(cpu) { - cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu, cyc); - } + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; - if (tsc_disabled > 0) + if (!boot_cpu_has(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; + } - /* now allow native_sched_clock() to use rdtsc */ + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies(false)) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + tsc_enable_sched_clock(); + } - tsc_disabled = 0; - static_branch_enable(&__use_tsc); + cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - + lpj_fine = get_loops_per_jiffy(); use_tsc_delay(); check_system_tsc_reliable(); @@ -1455,7 +1496,7 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); - if (tsc_disabled || !constant_tsc || !mask) + if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 19afdbd7d0a7..27ef714d886c 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,17 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * tsc_msr.c - TSC frequency enumeration via MSR + * TSC frequency enumeration via MSR * - * Copyright (C) 2013 Intel Corporation + * Copyright (C) 2013, 2018 Intel Corporation * Author: Bin Gao <bin.gao@intel.com> - * - * This file is released under the GPLv2. */ #include <linux/kernel.h> -#include <asm/processor.h> -#include <asm/setup.h> + #include <asm/apic.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> +#include <asm/msr.h> #include <asm/param.h> +#include <asm/tsc.h> #define MAX_NUM_FREQS 9 @@ -23,44 +25,48 @@ * field msr_plat does. */ struct freq_desc { - u8 x86_family; /* CPU family */ - u8 x86_model; /* model */ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ u32 freqs[MAX_NUM_FREQS]; }; -static struct freq_desc freq_desc_tables[] = { - /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, - /* CLV+ */ - { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, - /* TNG - Intel Atom processor Z3400 series */ - { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, - /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ - { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, - /* ANN - Intel Atom processor Z3500 series */ - { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, - /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ - { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, - 80000, 93300, 90000, 88900, 87500 } }, +/* + * Penwell and Clovertrail use spread spectrum clock, + * so the freq number is not exactly the same as reported + * by MSR based on SDM. + */ +static const struct freq_desc freq_desc_pnw = { + 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }; -static int match_cpu(u8 family, u8 model) -{ - int i; +static const struct freq_desc freq_desc_clv = { + 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } +}; - for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { - if ((family == freq_desc_tables[i].x86_family) && - (model == freq_desc_tables[i].x86_model)) - return i; - } +static const struct freq_desc freq_desc_byt = { + 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } +}; - return -1; -} +static const struct freq_desc freq_desc_cht = { + 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } +}; -/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ -#define id_to_freq(cpu_index, freq_id) \ - (freq_desc_tables[cpu_index].freqs[freq_id]) +static const struct freq_desc freq_desc_tng = { + 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } +}; + +static const struct freq_desc freq_desc_ann = { + 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } +}; + +static const struct x86_cpu_id tsc_msr_cpu_ids[] = { + INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), + INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + {} +}; /* * MSR-based CPU/TSC frequency discovery for certain CPUs. @@ -70,18 +76,17 @@ static int match_cpu(u8 family, u8 model) */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq_id, freq; + u32 lo, hi, ratio, freq; + const struct freq_desc *freq_desc; + const struct x86_cpu_id *id; unsigned long res; - int cpu_index; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); - if (cpu_index < 0) + id = x86_match_cpu(tsc_msr_cpu_ids); + if (!id) return 0; - if (freq_desc_tables[cpu_index].msr_plat) { + freq_desc = (struct freq_desc *)id->driver_data; + if (freq_desc->msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -91,8 +96,9 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); - freq_id = lo & 0x7; - freq = id_to_freq(cpu_index, freq_id); + + /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ + freq = freq_desc->freqs[lo & 0x7]; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index feb28fee6cea..26038eacf74a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -198,7 +198,7 @@ static int orc_sort_cmp(const void *_a, const void *_b) * whitelisted .o files which didn't get objtool generation. */ orc_a = cur_orc_table + (a - cur_orc_ip_table); - return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; + return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1; } #ifdef CONFIG_MODULES @@ -352,7 +352,7 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr bool unwind_next_frame(struct unwind_state *state) { - unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; bool indirect = false; @@ -363,9 +363,9 @@ bool unwind_next_frame(struct unwind_state *state) /* Don't let modules unload while we're reading their ORC data. */ preempt_disable(); - /* Have we reached the end? */ + /* End-of-stack check for user tasks: */ if (state->regs && user_mode(state->regs)) - goto done; + goto the_end; /* * Find the orc_entry associated with the text address. @@ -374,9 +374,16 @@ bool unwind_next_frame(struct unwind_state *state) * calls and calls to noreturn functions. */ orc = orc_find(state->signal ? state->ip : state->ip - 1); - if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) - goto done; - orig_ip = state->ip; + if (!orc) + goto err; + + /* End-of-stack check for kernel threads: */ + if (orc->sp_reg == ORC_REG_UNDEFINED) { + if (!orc->end) + goto err; + + goto the_end; + } /* Find the previous frame's stack: */ switch (orc->sp_reg) { @@ -402,7 +409,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg R10 at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->r10; break; @@ -411,7 +418,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg R13 at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->r13; break; @@ -420,7 +427,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg DI at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->di; break; @@ -429,7 +436,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg DX at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->dx; break; @@ -437,12 +444,12 @@ bool unwind_next_frame(struct unwind_state *state) default: orc_warn("unknown SP base reg %d for ip %pB\n", orc->sp_reg, (void *)state->ip); - goto done; + goto err; } if (indirect) { if (!deref_stack_reg(state, sp, &sp)) - goto done; + goto err; } /* Find IP, SP and possibly regs: */ @@ -451,7 +458,7 @@ bool unwind_next_frame(struct unwind_state *state) ip_p = sp - sizeof(long); if (!deref_stack_reg(state, ip_p, &state->ip)) - goto done; + goto err; state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->ip, (void *)ip_p); @@ -465,7 +472,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); - goto done; + goto err; } state->regs = (struct pt_regs *)sp; @@ -477,7 +484,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); - goto done; + goto err; } state->regs = (void *)sp - IRET_FRAME_OFFSET; @@ -500,18 +507,18 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_PREV_SP: if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) - goto done; + goto err; break; case ORC_REG_BP: if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) - goto done; + goto err; break; default: orc_warn("unknown BP base reg %d for ip %pB\n", orc->bp_reg, (void *)orig_ip); - goto done; + goto err; } /* Prevent a recursive loop due to bad ORC data: */ @@ -520,13 +527,16 @@ bool unwind_next_frame(struct unwind_state *state) state->sp <= prev_sp) { orc_warn("stack going in the wrong direction? ip=%pB\n", (void *)orig_ip); - goto done; + goto err; } preempt_enable(); return true; -done: +err: + state->error = true; + +the_end: preempt_enable(); state->stack_info.type = STACK_TYPE_UNKNOWN; return false; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9d0b5af7db91..1c03e4aa6474 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - update_sp0(tsk); + update_task_stack(tsk); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; preempt_enable(); @@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - update_sp0(tsk); + update_task_stack(tsk); preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5e1458f609a1..8bde0a419f86 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -55,19 +55,22 @@ jiffies_64 = jiffies; * so we can enable protection checks as well as retain 2MB large page * mappings for kernel text. */ -#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_RODATA_END \ +#define X86_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ - __end_rodata_hpage_align = .; + __end_rodata_hpage_align = .; \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); #else -#define X64_ALIGN_RODATA_BEGIN -#define X64_ALIGN_RODATA_END +#define X86_ALIGN_RODATA_BEGIN +#define X86_ALIGN_RODATA_END \ + . = ALIGN(PAGE_SIZE); \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END @@ -141,9 +144,9 @@ SECTIONS /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); - X64_ALIGN_RODATA_BEGIN + X86_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_RODATA_END + X86_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3ab867603e81..2792b5573818 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -109,7 +109,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; struct x86_platform_ops x86_platform __ro_after_init = { - .calibrate_cpu = native_calibrate_cpu, + .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 92fd433c50b9..1bbec387d289 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -85,7 +85,7 @@ config KVM_AMD_SEV def_bool y bool "AMD Secure Encrypted Virtualization (SEV) support" depends on KVM_AMD && X86_64 - depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP + depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) ---help--- Provides support for launching Encrypted VMs on AMD processors. diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7e042e3d47fd..7bcfa61375c0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | - (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | + (1 << KVM_FEATURE_PV_SEND_IPI); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4c4f4263420c..106482da6388 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4191,7 +4191,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) maxphyaddr = 36; rsvd = rsvd_bits(maxphyaddr, 63); if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) - rsvd &= ~CR3_PCID_INVD; + rsvd &= ~X86_CR3_PCID_NOFLUSH; } if (new_val & rsvd) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index af8caf965baa..01d209ab5481 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = synic_to_vcpu(synic); int ret; - if (!synic->active) + if (!synic->active && !host) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } -static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) +static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, + bool host) { int ret; - if (!synic->active) + if (!synic->active && !host) return 1; ret = 0; @@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_TSC_EMULATION_STATUS: hv->hv_tsc_emulation_status = data; break; + case HV_X64_MSR_TIME_REF_COUNT: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), data, host); } + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: @@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return kvm_hv_set_msr(vcpu, msr, data, host); } -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else - return kvm_hv_get_msr(vcpu, msr, pdata); + return kvm_hv_get_msr(vcpu, msr, pdata, host); } static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 837465d69c6d..d6aa969e20f1 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); bool kvm_hv_hypercall_enabled(struct kvm *kvm); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b5cd8465d44f..0cefba28c864 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit) +{ + int i; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + struct kvm_lapic_irq irq = {0}; + int cluster_size = op_64_bit ? 64 : 32; + int count = 0; + + irq.vector = icr & APIC_VECTOR_MASK; + irq.delivery_mode = icr & APIC_MODE_MASK; + irq.level = (icr & APIC_INT_ASSERT) != 0; + irq.trig_mode = icr & APIC_INT_LEVELTRIG; + + if (icr & APIC_DEST_MASK) + return -KVM_EINVAL; + if (icr & APIC_SHORT_MASK) + return -KVM_EINVAL; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + /* Bits above cluster_size are masked in the caller. */ + for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + min += cluster_size; + for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + rcu_read_unlock(); + return count; +} + static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { @@ -1379,7 +1419,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) * using swait_active() is safe. */ if (swait_active(q)) - swake_up(q); + swake_up_one(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d594690d8b95..a282321329b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -178,7 +178,24 @@ struct kvm_shadow_walk_iterator { unsigned index; }; -#define for_each_shadow_entry(_vcpu, _addr, _walker) \ +static const union kvm_mmu_page_role mmu_base_role_mask = { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + .smm = 1, + .guest_mode = 1, + .ad_disabled = 1, +}; + +#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ + for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ + (_root), (_addr)); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) @@ -221,7 +238,20 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | PT64_EPT_EXECUTABLE_MASK; static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + static void mmu_spte_set(u64 *sptep, u64 spte); +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -308,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, { unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -323,8 +357,14 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | + shadow_nonpresent_or_rsvd_mask; + u64 gpa = spte & ~mask; + + gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) + & shadow_nonpresent_or_rsvd_mask; + + return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) @@ -381,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static void kvm_mmu_clear_all_pte_masks(void) +static void kvm_mmu_reset_all_pte_masks(void) { shadow_user_mask = 0; shadow_accessed_mask = 0; @@ -391,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void) shadow_mmio_mask = 0; shadow_present_mask = 0; shadow_acc_track_mask = 0; + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + */ + if (boot_cpu_data.x86_phys_bits < + 52 - shadow_nonpresent_or_rsvd_mask_len) + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(boot_cpu_data.x86_phys_bits - + shadow_nonpresent_or_rsvd_mask_len, + boot_cpu_data.x86_phys_bits - 1); } static int is_cpuid_PSE36(void) @@ -890,7 +942,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL); + page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!page) return -ENOMEM; cache->objects[cache->nobjs++] = page; @@ -1986,7 +2038,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) { } @@ -2117,12 +2169,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return false; - } - - if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + if (sp->role.cr4_pae != !!is_pae(vcpu) + || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -2392,11 +2440,12 @@ out: return sp; } -static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, - struct kvm_vcpu *vcpu, u64 addr) +static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, hpa_t root, + u64 addr) { iterator->addr = addr; - iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu.shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && @@ -2405,6 +2454,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { + /* + * prev_root is currently only used for 64-bit hosts. So only + * the active root_hpa is valid here. + */ + BUG_ON(root != vcpu->arch.mmu.root_hpa); + iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; @@ -2414,6 +2469,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, } } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + addr); +} + static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) @@ -2702,6 +2764,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_unsync_page(vcpu, sp); } + /* + * We need to ensure that the marking of unsync pages is visible + * before the SPTE is updated to allow writes because + * kvm_mmu_sync_roots() checks the unsync flags without holding + * the MMU lock and so can race with this. If the SPTE was updated + * before the page had been marked as unsync-ed, something like the + * following could happen: + * + * CPU 1 CPU 2 + * --------------------------------------------------------------------- + * 1.2 Host updates SPTE + * to be writable + * 2.1 Guest writes a GPTE for GVA X. + * (GPTE being in the guest page table shadowed + * by the SP from CPU 1.) + * This reads SPTE during the page table walk. + * Since SPTE.W is read as 1, there is no + * fault. + * + * 2.2 Guest issues TLB flush. + * That causes a VM Exit. + * + * 2.3 kvm_mmu_sync_pages() reads sp->unsync. + * Since it is false, so it just returns. + * + * 2.4 Guest accesses GVA X. + * Since the mapping in the SP was not updated, + * so the old mapping for GVA X incorrectly + * gets used. + * 1.1 Host marks SP + * as unsync + * (sp->unsync = true) + * + * The write barrier below ensures that 1.1 happens before 1.2 and thus + * the situation in 2.4 does not arise. The implicit barrier in 2.2 + * pairs with this write barrier. + */ + smp_wmb(); + return false; } @@ -2724,6 +2825,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) return true; } +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, @@ -2800,7 +2905,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret = 1; + ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } @@ -2816,7 +2921,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: if (mmu_spte_update(sptep, spte)) - kvm_flush_remote_tlbs(vcpu->kvm); + ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; done: return ret; } @@ -2827,7 +2932,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, { int was_rmapped = 0; int rmap_count; + int set_spte_ret; int ret = RET_PF_RETRY; + bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2844,22 +2951,25 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else was_rmapped = 1; } - if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, - true, host_writable)) { + set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, + speculative, true, host_writable); + if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { if (write_fault) ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) + kvm_flush_remote_tlbs(vcpu->kvm); if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; @@ -3358,26 +3468,47 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, *root_hpa = INVALID_PAGE; } -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu) +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) { int i; LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; - if (!VALID_PAGE(mmu->root_hpa)) - return; + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); + + /* Before acquiring the MMU lock, see if we need to do any real work. */ + if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && + VALID_PAGE(mmu->prev_roots[i].hpa)) + break; + + if (i == KVM_MMU_NUM_PREV_ROOTS) + return; + } spin_lock(&vcpu->kvm->mmu_lock); - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); - } else { - for (i = 0; i < 4; ++i) - if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i], - &invalid_list); - mmu->root_hpa = INVALID_PAGE; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) + mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + &invalid_list); + + if (free_active_root) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { + mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, + &invalid_list); + } else { + for (i = 0; i < 4; ++i) + if (mmu->pae_root[i] != 0) + mmu_free_root_page(vcpu->kvm, + &mmu->pae_root[i], + &invalid_list); + mmu->root_hpa = INVALID_PAGE; + } } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3546,7 +3677,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return mmu_alloc_shadow_roots(vcpu); } -static void mmu_sync_roots(struct kvm_vcpu *vcpu) +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; @@ -3558,14 +3689,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + + /* + * Even if another CPU was marking the SP as unsync-ed + * simultaneously, any guest page table changes are not + * guaranteed to be visible anyway until this VCPU issues a TLB + * flush strictly after those changes are made. We only need to + * ensure that the other CPU sets these flags before any actual + * changes to the page tables are made. The comments in + * mmu_need_write_protect() describe what could go wrong if this + * requirement isn't satisfied. + */ + if (!smp_load_acquire(&sp->unsync) && + !smp_load_acquire(&sp->unsync_children)) + return; + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + mmu_sync_children(vcpu, sp); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + spin_unlock(&vcpu->kvm->mmu_lock); return; } + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -3575,13 +3731,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); -} -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); spin_unlock(&vcpu->kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); @@ -3840,6 +3991,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, { int r = 1; + vcpu->arch.l1tf_flush_l1d = true; switch (vcpu->arch.apf.host_apf_reason) { default: trace_kvm_page_fault(fault_address, error_code); @@ -3947,16 +4099,107 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; context->direct_map = true; context->nx = false; } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) +/* + * Find out if a previously cached root matching the new CR3/role is available. + * The current root is also inserted into the cache. + * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is + * returned. + * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and + * false is returned. This root should now be freed by the caller. + */ +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) +{ + uint i; + struct kvm_mmu_root_info root; + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + root.cr3 = mmu->get_cr3(vcpu); + root.hpa = mmu->root_hpa; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + swap(root, mmu->prev_roots[i]); + + if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) && + page_header(root.hpa) != NULL && + new_role.word == page_header(root.hpa)->role.word) + break; + } + + mmu->root_hpa = root.hpa; + + return i < KVM_MMU_NUM_PREV_ROOTS; +} + +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { - kvm_mmu_free_roots(vcpu); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + /* + * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid + * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs + * later if necessary. + */ + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + mmu->root_level >= PT64_ROOT_4LEVEL) { + if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) + return false; + + if (cached_root_available(vcpu, new_cr3, new_role)) { + /* + * It is possible that the cached previous root page is + * obsolete because of a change in the MMU + * generation number. However, that is accompanied by + * KVM_REQ_MMU_RELOAD, which will free the root that we + * have set here and allocate a new one. + */ + + kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); + if (!skip_tlb_flush) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); + } + + /* + * The last MMIO access's GVA and GPA are cached in the + * VCPU. When switching to a new CR3, that GVA->GPA + * mapping may no longer be valid. So clear any cached + * MMIO info even when we don't need to sync the shadow + * page tables. + */ + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); + + __clear_sp_write_flooding_count( + page_header(mmu->root_hpa)); + + return true; + } + } + + return false; } +static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) +{ + if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT); +} + +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) +{ + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), + skip_tlb_flush); +} +EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); + static unsigned long get_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); @@ -4431,7 +4674,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; context->direct_map = false; } @@ -4461,7 +4703,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; context->direct_map = false; } @@ -4471,20 +4712,32 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } +static union kvm_mmu_page_role +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role = {0}; + + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.ad_disabled = (shadow_accessed_mask == 0); + role.level = kvm_x86_ops->get_tdp_level(vcpu); + role.direct = true; + role.access = ACC_ALL; + + return role; +} + static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; - context->base_role.word = 0; - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); - context->base_role.ad_disabled = (shadow_accessed_mask == 0); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_tdp_mmu_root_page_role(vcpu).word; context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); - context->root_hpa = INVALID_PAGE; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; @@ -4519,13 +4772,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) reset_tdp_shadow_zero_bits_mask(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +static union kvm_mmu_page_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu) { + union kvm_mmu_page_role role = {0}; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); - struct kvm_mmu *context = &vcpu->arch.mmu; - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); + role.nxe = is_nx(vcpu); + role.cr4_pae = !!is_pae(vcpu); + role.cr0_wp = is_write_protection(vcpu); + role.smep_andnot_wp = smep && !is_write_protection(vcpu); + role.smap_andnot_wp = smap && !is_write_protection(vcpu); + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.direct = !is_paging(vcpu); + role.access = ACC_ALL; + + if (!is_long_mode(vcpu)) + role.level = PT32E_ROOT_LEVEL; + else if (is_la57_mode(vcpu)) + role.level = PT64_ROOT_5LEVEL; + else + role.level = PT64_ROOT_4LEVEL; + + return role; +} + +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -4536,26 +4812,34 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) else paging32_init_context(vcpu, context); - context->base_role.nxe = is_nx(vcpu); - context->base_role.cr4_pae = !!is_pae(vcpu); - context->base_role.cr0_wp = is_write_protection(vcpu); - context->base_role.smep_andnot_wp - = smep && !is_write_protection(vcpu); - context->base_role.smap_andnot_wp - = smap && !is_write_protection(vcpu); - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_shadow_mmu_root_page_role(vcpu).word; reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +static union kvm_mmu_page_role +kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) +{ + union kvm_mmu_page_role role = vcpu->arch.mmu.base_role; + + role.level = PT64_ROOT_4LEVEL; + role.direct = false; + role.ad_disabled = !accessed_dirty; + role.guest_mode = true; + role.access = ACC_ALL; + + return role; +} + void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty) + bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.mmu; + union kvm_mmu_page_role root_page_role = + kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -4566,10 +4850,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; - context->root_hpa = INVALID_PAGE; context->direct_map = false; - context->base_role.ad_disabled = !accessed_dirty; - context->base_role.guest_mode = 1; + context->base_role.word = root_page_role.word & mmu_base_role_mask.word; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); @@ -4632,8 +4914,17 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_last_nonleaf_level(vcpu, g_context); } -static void init_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) { + if (reset_roots) { + uint i; + + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + } + if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -4641,11 +4932,21 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu) else init_kvm_softmmu(vcpu); } +EXPORT_SYMBOL_GPL(kvm_init_mmu); + +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) +{ + if (tdp_enabled) + return kvm_calc_tdp_mmu_root_page_role(vcpu); + else + return kvm_calc_shadow_mmu_root_page_role(vcpu); +} void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -4660,8 +4961,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (r) goto out; - /* set_cr3() should ensure TLB has been flushed */ - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_mmu_load_cr3(vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); out: return r; } @@ -4669,7 +4970,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4822,16 +5123,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush; - union kvm_mmu_page_role mask = { }; - - mask.cr0_wp = 1; - mask.cr4_pae = 1; - mask.nxe = 1; - mask.smep_andnot_wp = 1; - mask.smap_andnot_wp = 1; - mask.smm = 1; - mask.guest_mode = 1; - mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4875,7 +5166,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word) && rmap_can_add(vcpu)) + & mmu_base_role_mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (need_remote_flush(entry, *spte)) remote_flush = true; @@ -5000,12 +5291,67 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + int i; + + /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + + mmu->invlpg(vcpu, gva, mmu->root_hpa); + + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Since it would take us roughly similar amount + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. + * + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. + */ + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + + kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) +{ + struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool tlb_flush = false; + uint i; + + if (pcid == kvm_get_active_pcid(vcpu)) { + mmu->invlpg(vcpu, gva, mmu->root_hpa); + tlb_flush = true; + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (VALID_PAGE(mmu->prev_roots[i].hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + tlb_flush = true; + } + } + + if (tlb_flush) + kvm_x86_ops->tlb_flush_gva(vcpu, gva); + + ++vcpu->stat.invlpg; + + /* + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. + */ +} +EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); + void kvm_enable_tdp(void) { tdp_enabled = true; @@ -5029,6 +5375,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; + if (tdp_enabled) + return 0; + /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first @@ -5047,11 +5396,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { + uint i; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + return alloc_mmu_pages(vcpu); } @@ -5059,7 +5413,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, @@ -5499,7 +5853,7 @@ int kvm_mmu_module_init(void) { int ret = -ENOMEM; - kvm_mmu_clear_all_pte_masks(); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b408c0ad612..1fab69c0b2f3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -61,9 +61,10 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty); + bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); @@ -85,6 +86,27 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } +static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) +{ + BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); + + return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) + ? cr3 & X86_CR3_PCID_MASK + : 0; +} + +static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) +{ + return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); +} + +static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) +{ + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | + kvm_get_active_pcid(vcpu)); +} + /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6288e9d7068e..14ffd973df54 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -181,7 +181,7 @@ no_present: * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK * to signify readability since it isn't used in the EPT case */ -static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +static inline unsigned FNAME(gpte_access)(u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT @@ -394,8 +394,8 @@ retry_walk: accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); - walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -508,7 +508,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); @@ -856,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -871,13 +871,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) */ mmu_topup_memory_caches(vcpu); - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (!VALID_PAGE(root_hpa)) { WARN_ON(1); return; } spin_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry(vcpu, gva, iterator) { + for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -968,6 +968,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; + int set_spte_ret = 0; /* direct kvm_mmu_page can not be unsync. */ BUG_ON(sp->role.direct); @@ -1002,7 +1003,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte); + pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, @@ -1024,12 +1025,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - set_spte(vcpu, &sp->spt[i], pte_access, - PT_PAGE_TABLE_LEVEL, gfn, - spte_to_pfn(sp->spt[i]), true, false, - host_writable); + set_spte_ret |= set_spte(vcpu, &sp->spt[i], + pte_access, PT_PAGE_TABLE_LEVEL, + gfn, spte_to_pfn(sp->spt[i]), + true, false, host_writable); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + kvm_flush_remote_tlbs(vcpu->kvm); + return nr_present; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f059a73f0fd0..73e27a98456f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2884,7 +2884,6 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -5435,6 +5434,13 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) svm->asid_generation--; } +static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + invlpga(gva, svm->vmcb->control.asid); +} + static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { } @@ -5766,7 +5772,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5779,8 +5784,6 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) /* Also sync guest cr3 here in case we live migrate */ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - - svm_flush_tlb(vcpu, true); } static int is_disabled(void) @@ -7090,6 +7093,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, + .tlb_flush_gva = svm_flush_tlb_gva, .run = svm_vcpu_run, .handle_exit = handle_exit, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1689f433f3a0..1519f030fd73 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -38,6 +38,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include <asm/asm.h> #include <asm/cpu.h> #include <asm/io.h> #include <asm/desc.h> @@ -188,23 +189,198 @@ module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); +static DEFINE_MUTEX(vmx_l1d_flush_mutex); + +/* Storage for pre module init parameter parsing */ +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; + +static const struct { + const char *option; + enum vmx_l1d_flush_state cmd; +} vmentry_l1d_param[] = { + {"auto", VMENTER_L1D_FLUSH_AUTO}, + {"never", VMENTER_L1D_FLUSH_NEVER}, + {"cond", VMENTER_L1D_FLUSH_COND}, + {"always", VMENTER_L1D_FLUSH_ALWAYS}, +}; + +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +{ + struct page *page; + unsigned int i; + + if (!enable_ept) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; + return 0; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; + + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } + + /* If set to auto use the default l1tf mitigation method */ + if (l1tf == VMENTER_L1D_FLUSH_AUTO) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + l1tf = VMENTER_L1D_FLUSH_NEVER; + break; + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + l1tf = VMENTER_L1D_FLUSH_COND; + break; + case L1TF_MITIGATION_FULL: + case L1TF_MITIGATION_FULL_FORCE: + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + break; + } + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + } + + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + if (!page) + return -ENOMEM; + vmx_l1d_flush_pages = page_address(page); + + /* + * Initialize each page with a different pattern in + * order to protect against KSM in the nested + * virtualization case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } + } + + l1tf_vmx_mitigation = l1tf; + + if (l1tf != VMENTER_L1D_FLUSH_NEVER) + static_branch_enable(&vmx_l1d_should_flush); + else + static_branch_disable(&vmx_l1d_should_flush); + + if (l1tf == VMENTER_L1D_FLUSH_COND) + static_branch_enable(&vmx_l1d_flush_cond); + else + static_branch_disable(&vmx_l1d_flush_cond); + return 0; +} + +static int vmentry_l1d_flush_parse(const char *s) +{ + unsigned int i; + + if (s) { + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { + if (sysfs_streq(s, vmentry_l1d_param[i].option)) + return vmentry_l1d_param[i].cmd; + } + } + return -EINVAL; +} + +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + int l1tf, ret; + + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + + l1tf = vmentry_l1d_flush_parse(s); + if (l1tf < 0) + return l1tf; + + /* + * Has vmx_init() run already? If not then this is the pre init + * parameter parsing. In that case just store the value and let + * vmx_init() do the proper setup after enable_ept has been + * established. + */ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { + vmentry_l1d_flush_param = l1tf; + return 0; + } + + mutex_lock(&vmx_l1d_flush_mutex); + ret = vmx_setup_l1d_flush(l1tf); + mutex_unlock(&vmx_l1d_flush_mutex); + return ret; +} + +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + struct kvm_vmx { struct kvm kvm; unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; }; #define NR_AUTOLOAD_MSRS 8 +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + struct vmcs { - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; char data[0]; }; /* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + +/* * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs * loaded on this CPU (so we can clear them if the CPU goes down). @@ -215,14 +391,13 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; }; struct shared_msr_entry { @@ -253,7 +428,7 @@ struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. */ - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ @@ -421,7 +596,7 @@ struct __packed vmcs12 { "Offset of " #field " in struct vmcs12 has changed.") static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(revision_id, 0); + CHECK_OFFSET(hdr, 0); CHECK_OFFSET(abort, 4); CHECK_OFFSET(launch_state, 8); CHECK_OFFSET(io_bitmap_a, 40); @@ -640,6 +815,12 @@ struct nested_vmx { */ struct vmcs12 *cached_vmcs12; /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; + /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 */ @@ -757,6 +938,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +struct vmx_msrs { + unsigned int nr; + struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -784,26 +970,20 @@ struct vcpu_vmx { /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + struct vmx_msrs guest; + struct vmx_msrs host; } msr_autoload; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - int gs_ldt_reload_needed; - int fs_reload_needed; - u64 msr_host_bndcfgs; - } host_state; + struct { int vm86_active; ulong save_rflags; @@ -853,6 +1033,7 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; }; enum segment_cache_field { @@ -1072,6 +1253,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); @@ -1342,6 +1528,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) * GUEST_IA32_RTIT_CTL = 0x00002814, */ } + +/* check_ept_pointer() should be under protection of ept_pointer_lock. */ +static void check_ept_pointer_match(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + u64 tmp_eptp = INVALID_PAGE; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!VALID_PAGE(tmp_eptp)) { + tmp_eptp = to_vmx(vcpu)->ept_pointer; + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_MISMATCH; + return; + } + } + + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; +} + +static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +{ + int ret; + + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) + check_ept_pointer_match(kvm); + + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + ret = -ENOTSUPP; + goto out; + } + + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + +out: + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + return ret; +} #else /* !IS_ENABLED(CONFIG_HYPERV) */ static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -1716,6 +1944,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) CPU_BASED_MONITOR_TRAP_FLAG; } +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -1796,6 +2030,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) VMX_VMFUNC_EPTP_SWITCHING); } +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1826,11 +2065,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; + bool error; - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) + : CC_OUT(na) (error) : "a"(&operand), "c"(ext) + : "memory"); + BUG_ON(error); } static inline void __invept(int ext, u64 eptp, gpa_t gpa) @@ -1838,11 +2078,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; + bool error; - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) + : CC_OUT(na) (error) : "a" (&operand), "c" (ext) + : "memory"); + BUG_ON(error); } static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) @@ -1858,12 +2099,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); } @@ -1880,15 +2121,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); } @@ -1966,6 +2207,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) @@ -2100,10 +2354,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) + : CC_OUT(na) (error) : "a"(value), "d"(field)); if (unlikely(error)) vmwrite_error(field, value); } @@ -2377,9 +2631,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } +static int find_msr(struct vmx_msrs *m, unsigned int msr) +{ + unsigned int i; + + for (i = 0; i < m->nr; ++i) { + if (m->val[i].index == msr) + return i; + } + return -ENOENT; +} + static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) { - unsigned i; + int i; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -2400,18 +2665,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } + i = find_msr(&m->guest, msr); + if (i < 0) + goto skip_guest; + --m->guest.nr; + m->guest.val[i] = m->guest.val[m->guest.nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; - - if (i == m->nr) +skip_guest: + i = find_msr(&m->host, msr); + if (i < 0) return; - --m->nr; - m->guest[i] = m->guest[m->nr]; - m->host[i] = m->host[m->nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + + --m->host.nr; + m->host.val[i] = m->host.val[m->host.nr]; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -2426,9 +2694,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val) + u64 guest_val, u64 host_val, bool entry_only) { - unsigned i; + int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -2463,24 +2731,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; + i = find_msr(&m->guest, msr); + if (!entry_only) + j = find_msr(&m->host, msr); - if (i == NR_AUTOLOAD_MSRS) { + if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; - } else if (i == m->nr) { - ++m->nr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } + if (i < 0) { + i = m->guest.nr++; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); + } + m->guest.val[i].index = msr; + m->guest.val[i].value = guest_val; - m->guest[i].index = msr; - m->guest[i].value = guest_val; - m->host[i].index = msr; - m->host[i].value = host_val; + if (entry_only) + return; + + if (j < 0) { + j = m->host.nr++; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + } + m->host.val[j].index = msr; + m->host.val[j].value = host_val; } static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) @@ -2524,7 +2799,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer); + guest_efer, host_efer, false); return false; } else { guest_efer &= ~ignore_bits; @@ -2566,112 +2841,150 @@ static unsigned long segment_base(u16 selector) } #endif -static void vmx_save_host_state(struct kvm_vcpu *vcpu) +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; int i; - if (vmx->host_state.loaded) + if (vmx->loaded_cpu_state) return; - vmx->host_state.loaded = 1; + vmx->loaded_cpu_state = vmx->loaded_vmcs; + host_state = &vmx->loaded_cpu_state->host_state; + /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 - save_fsgs_for_kvm(); - vmx->host_state.fs_sel = current->thread.fsindex; - vmx->host_state.gs_sel = current->thread.gsindex; -#else - savesegment(fs, vmx->host_state.fs_sel); - savesegment(gs, vmx->host_state.gs_sel); -#endif - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); - vmx->host_state.fs_reload_needed = 0; + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); + if (likely(is_64bit_mm(current->mm))) { + save_fsgs_for_kvm(); + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; + fs_base = current->thread.fsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; } else { - vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; - } - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); - else { - vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = read_msr(MSR_FS_BASE); + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } -#ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); - - vmcs_writel(HOST_FS_BASE, current->thread.fsbase); - vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); - - vmx->msr_host_kernel_gs_base = current->thread.gsbase; if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + gs_base = segment_base(gs_sel); #endif - if (boot_cpu_has(X86_FEATURE_MPX)) - rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); + + if (unlikely(fs_sel != host_state->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host_state->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host_state->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host_state->gs_sel = gs_sel; + } + if (unlikely(fs_base != host_state->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host_state->fs_base = fs_base; + } + if (unlikely(gs_base != host_state->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host_state->gs_base = gs_base; + } + for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, vmx->guest_msrs[i].mask); } -static void __vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) + struct vmcs_host_state *host_state; + + if (!vmx->loaded_cpu_state) return; + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + host_state = &vmx->loaded_cpu_state->host_state; + ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; + vmx->loaded_cpu_state = NULL; + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); + load_gs_index(host_state->gs_sel); #else - loadsegment(gs, vmx->host_state.gs_sel); + loadsegment(gs, host_state->gs_sel); #endif } - if (vmx->host_state.fs_reload_needed) - loadsegment(fs, vmx->host_state.fs_sel); + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); #ifdef CONFIG_X86_64 - if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { - loadsegment(ds, vmx->host_state.ds_sel); - loadsegment(es, vmx->host_state.es_sel); + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (vmx->host_state.msr_host_bndcfgs) - wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); load_fixmap_gdt(raw_smp_processor_id()); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - preempt_disable(); - __vmx_load_host_state(vmx); - preempt_enable(); + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, + vmx->msr_guest_kernel_gs_base); + preempt_enable(); + } + return vmx->msr_guest_kernel_gs_base; } +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + } + vmx->msr_guest_kernel_gs_base = data; +} +#endif + static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -2813,7 +3126,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); - __vmx_load_host_state(to_vmx(vcpu)); + vmx_prepare_switch_to_host(to_vmx(vcpu)); } static bool emulation_required(struct kvm_vcpu *vcpu) @@ -3034,7 +3347,7 @@ static bool vmx_rdtscp_supported(void) static bool vmx_invpcid_supported(void) { - return cpu_has_vmx_invpcid() && enable_ept; + return cpu_has_vmx_invpcid(); } /* @@ -3277,6 +3590,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -3744,8 +4063,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - msr_info->data = vmx->msr_guest_kernel_gs_base; + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: @@ -3845,8 +4163,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - vmx->msr_guest_kernel_gs_base = data; + vmx_write_guest_kernel_gs_base(vmx, data); break; #endif case MSR_IA32_SYSENTER_CS: @@ -3978,7 +4295,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_xss = data; if (vcpu->arch.ia32_xss != host_xss) add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss); + vcpu->arch.ia32_xss, host_xss, false); else clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; @@ -4322,11 +4639,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - /* KVM supports Enlightened VMCS v1 only */ - if (static_branch_unlikely(&enable_evmcs)) - vmcs_conf->revision_id = KVM_EVMCS_VERSION; - else - vmcs_conf->revision_id = vmx_msr_low; + vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; @@ -4385,7 +4698,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) { int node = cpu_to_node(cpu); struct page *pages; @@ -4396,7 +4709,15 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) return NULL; vmcs = page_address(pages); memset(vmcs, 0, vmcs_config.size); - vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ + + /* KVM supports Enlightened VMCS v1 only */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; + else + vmcs->hdr.revision_id = vmcs_config.revision_id; + + if (shadow) + vmcs->hdr.shadow_vmcs = 1; return vmcs; } @@ -4420,14 +4741,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } -static struct vmcs *alloc_vmcs(void) +static struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); } static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { - loaded_vmcs->vmcs = alloc_vmcs(); + loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; @@ -4449,6 +4770,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) evmcs->hv_enlightenments_control.msr_bitmap = 1; } } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + return 0; out_vmcs: @@ -4558,12 +4882,25 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(false, cpu); if (!vmcs) { free_kvm_area(); return -ENOMEM; } + /* + * When eVMCS is enabled, alloc_vmcs_cpu() sets + * vmcs->revision_id to KVM_EVMCS_VERSION instead of + * revision_id reported by MSR_IA32_VMX_BASIC. + * + * However, even though not explictly documented by + * TLFS, VMXArea passed as VMXON argument should + * still be marked with revision_id reported by + * physical CPU. + */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->hdr.revision_id = vmcs_config.revision_id; + per_cpu(vmxarea, cpu) = vmcs; } return 0; @@ -4719,10 +5056,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) return; /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). + * MSR_KERNEL_GS_BASE is not intercepted when the guest is in + * 64-bit mode as a 64-bit kernel may frequently access the + * MSR. This means we need to manually save/restore the MSR + * when switching between guest and host state, but only if + * the guest is in 64-bit mode. Sync our cached value if the + * guest is transitioning to 32-bit mode and the CPU contains + * guest state, i.e. the cache is stale. */ - vmx_load_host_state(to_vmx(vcpu)); +#ifdef CONFIG_X86_64 + if (!(efer & EFER_LMA)) + (void)vmx_read_guest_kernel_gs_base(vmx); +#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); @@ -4779,6 +5124,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + int vpid = to_vmx(vcpu)->vpid; + + if (!vpid_sync_vcpu_addr(vpid, addr)) + vpid_sync_context(vpid); + + /* + * If VPIDs are not supported or enabled, then the above is a no-op. + * But we don't really need a TLB flush in that case anyway, because + * each VM entry/exit includes an implicit flush when VPID is 0. + */ +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4960,6 +5319,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm *kvm = vcpu->kvm; unsigned long guest_cr3; u64 eptp; @@ -4967,15 +5327,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); + + if (kvm_x86_ops->tlb_remote_flush) { + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + to_vmx(vcpu)->ept_pointer = eptp; + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_CHECK; + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + } + if (enable_unrestricted_guest || is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } @@ -5911,19 +6279,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in - * __vmx_load_host_state(), in case userspace uses the null selectors - * too (the expected case). + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); @@ -6048,8 +6416,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; - /* Enable INVPCID for non-ept guests may cause performance regression. */ - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -6178,9 +6544,6 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { -#ifdef CONFIG_X86_64 - unsigned long a; -#endif int i; if (enable_shadow_vmcs) { @@ -6235,24 +6598,17 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); @@ -6272,8 +6628,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); + vmx->arch_capabilities = kvm_get_arch_capabilities(); vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); @@ -7478,6 +7833,7 @@ static void vmx_enable_tdp(void) static __init int hardware_setup(void) { + unsigned long host_bndcfgs; int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7502,6 +7858,11 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7538,6 +7899,12 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; @@ -7564,6 +7931,11 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + kvm_x86_ops->set_nested_state = NULL; + } + /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -7840,10 +8212,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) return 0; } +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already have an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); @@ -7854,25 +8251,26 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vpid02 = allocate_vpid(); + vmx->nested.vmxon = true; return 0; out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: @@ -7915,7 +8313,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* CPL=0 must be checked manually. */ if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 1; } @@ -7978,15 +8376,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - if (vmx_get_cpl(vcpu)) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); return 0; } + return 1; } @@ -8039,6 +8438,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -8124,7 +8524,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). */ -static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); @@ -8133,7 +8533,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - p = ((char *)(get_vmcs12(vcpu))) + offset; + p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: @@ -8155,10 +8555,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, } -static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); - char *p = ((char *) get_vmcs12(vcpu)) + offset; + char *p = (char *)vmcs12 + offset; if (offset < 0) return offset; @@ -8211,7 +8611,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; field_value = __vmcs_readl(field); - vmcs12_write_any(&vmx->vcpu, field, field_value); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); } /* * Skip the VM-exit information fields if they are read-only. @@ -8246,7 +8646,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(&vmx->vcpu, field, &field_value); + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); __vmcs_writel(field, field_value); } } @@ -8276,6 +8676,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8283,10 +8684,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_vmcs12(vcpu)) return kvm_skip_emulated_instruction(vcpu); + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + } + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vcpu, field, &field_value) < 0) { + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8328,6 +8743,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8362,23 +8778,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(vcpu, field, field_value) < 0) { + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } - switch (field) { + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { #define SHADOW_FIELD_RW(x) case x: #include "vmx_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } } nested_vmx_succeed(vcpu); @@ -8429,7 +8866,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); - if (new_vmcs12->revision_id != VMCS12_REVISION) { + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, @@ -8456,21 +8895,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gva_t vmcs_gva; + unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; + gva_t gva; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, &vmcs_gva)) + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, vmcs_gva, - (void *)&to_vmx(vcpu)->nested.current_vmptr, - sizeof(u64), &e)) { + if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, + sizeof(gpa_t), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } @@ -8628,6 +9066,105 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} + static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -8831,6 +9368,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; @@ -9003,6 +9541,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + /* * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we * should handle it ourselves in L0 (and then continue L2). Only call this @@ -9087,10 +9649,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* @@ -9523,6 +10090,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + /* + * This code is only executed when the the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + bool flush_l1d; + + /* + * Clear the per-vcpu flush bit, it gets set again + * either from vcpu_run() or from one of the unsafe + * VMEXIT handlers. + */ + flush_l1d = vcpu->arch.l1tf_flush_l1d; + vcpu->arch.l1tf_flush_l1d = false; + + /* + * Clear the per-cpu flush bit, it gets set again from + * the interrupt handlers. + */ + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); + kvm_clear_cpu_l1tf_flush_l1d(); + + if (!flush_l1d) + return; + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -9924,7 +10564,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host); + msrs[i].host, false); } static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) @@ -9978,15 +10618,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the @@ -10019,6 +10659,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? (unsigned long)¤t_evmcs->host_rsp : 0; + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" @@ -10179,9 +10822,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * - * We can't defer this to vmx_load_host_state() since that function - * may be executed in interrupt context, which saves and restore segments - * around it, nullifying its effect. + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); @@ -10242,8 +10885,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); } @@ -10346,11 +10989,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) { + if (nested) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, kvm_vcpu_apicv_active(&vmx->vcpu)); - vmx->nested.vpid02 = allocate_vpid(); - } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -10367,7 +11008,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: - free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); @@ -10381,10 +11021,39 @@ free_vcpu: return ERR_PTR(err); } +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" + static int vmx_vm_init(struct kvm *kvm) { + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + if (!ple_gap) kvm->arch.pause_in_guest = true; + + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + /* 'I explicitly don't care' is set */ + break; + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + /* + * Warn upon starting the first VM in a potentially + * insecure environment. + */ + if (cpu_smt_control == CPU_SMT_ENABLED) + pr_warn_once(L1TF_MSG_SMT); + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) + pr_warn_once(L1TF_MSG_L1D); + break; + case L1TF_MITIGATION_FULL_FORCE: + /* Flush is enforced */ + break; + } + } return 0; } @@ -10583,11 +11252,11 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) return 1; - kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu)); + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -10635,9 +11304,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct page *page; u64 hpa; @@ -10878,6 +11547,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -10935,11 +11636,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, unsigned long addr_field) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int maxphyaddr; u64 count, addr; - if (vmcs12_read_any(vcpu, count_field, &count) || - vmcs12_read_any(vcpu, addr_field, &addr)) { + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { WARN_ON(1); return -EINVAL; } @@ -10989,6 +11691,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -11138,12 +11853,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 1; } } - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } - kvm_mmu_reset_context(vcpu); + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3, false); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + return 0; } @@ -11230,7 +11949,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set host-state according to L0's settings (vmcs12 is irrelevant here) * Some constant fields are set here by vmx_set_constant_host_state(). * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() + * is called. */ vmx_set_constant_host_state(vmx); @@ -11238,10 +11958,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set the MSR load/store lists to match L0's settings. */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); set_cr4_guest_host_mask(vmx); @@ -11302,11 +12022,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - /* - * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, - * HOST_FS_BASE, HOST_GS_BASE. - */ - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); @@ -11371,6 +12086,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); @@ -11590,6 +12308,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high) || @@ -11690,6 +12411,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return 0; } +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) { @@ -11701,8 +12449,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && - vmcs12->vmcs_link_pointer != -1ull) { + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } @@ -11749,13 +12496,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) +/* + * If exit_qual is NULL, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. + */ +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 msr_entry_idx; - u32 exit_qual; - int r; + bool from_vmentry = !!exit_qual; + u32 dummy_exit_qual; + int r = 0; enter_guest_mode(vcpu); @@ -11769,17 +12520,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) vcpu->arch.tsc_offset += vmcs12->tsc_offset; r = EXIT_REASON_INVALID_STATE; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual)) goto fail; - nested_get_vmcs12_pages(vcpu, vmcs12); + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); - r = EXIT_REASON_MSR_LOAD_FAIL; - msr_entry_idx = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (msr_entry_idx) - goto fail; + r = EXIT_REASON_MSR_LOAD_FAIL; + *exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (*exit_qual) + goto fail; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -11794,8 +12556,7 @@ fail: vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); - return 1; + return r; } /* @@ -11818,6 +12579,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12 = get_vmcs12(vcpu); + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) { + nested_vmx_failInvalid(vcpu); + goto out; + } + if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); @@ -11872,12 +12644,28 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); vmx->nested.nested_run_pending = 0; - return ret; + return 1; } + /* Hide L1D cache contents from the nested guest. */ + vmx->vcpu.arch.l1tf_flush_l1d = true; + + /* + * Must happen outside of enter_vmx_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu. @@ -12387,6 +13175,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); @@ -12398,8 +13197,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_segment_cache_clear(vmx); /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, @@ -12961,7 +13760,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) if (vmx->nested.smm.guest_mode) { vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, NULL); vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; @@ -12976,6 +13775,199 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx->nested.current_vmptr != -1ull) { + kvm_state.size += VMCS12_SIZE; + + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (vmx->nested.current_vmptr == -1ull) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow vmcs linked to vmcs01, unless + * sync_shadow_vmcs is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) + sync_vmcs12(vcpu, vmcs12); + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + +out: + return kvm_state.size; +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->hdr.revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if (copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vmx->nested.nested_run_pending = 1; + + vmx->nested.dirty_vmcs12 = true; + ret = enter_vmx_non_root_mode(vcpu, NULL); + if (ret) + return -EINVAL; + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -12995,7 +13987,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .prepare_guest_switch = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -13028,6 +14020,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, @@ -13110,12 +14103,61 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_nested_state = vmx_get_nested_state, + .set_nested_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, }; +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + +static void vmx_exit(void) +{ +#ifdef CONFIG_KEXEC_CORE + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + + kvm_exit(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (static_branch_unlikely(&enable_evmcs)) { + int cpu; + struct hv_vp_assist_page *vp_ap; + /* + * Reset everything to support using non-enlightened VMCS + * access later (e.g. when we reload the module with + * enlightened_vmcs=0) + */ + for_each_online_cpu(cpu) { + vp_ap = hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; + } + + static_branch_disable(&enable_evmcs); + } +#endif + vmx_cleanup_l1d_flush(); +} +module_exit(vmx_exit); + static int __init vmx_init(void) { int r; @@ -13150,10 +14192,25 @@ static int __init vmx_init(void) #endif r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; + /* + * Must be called after kvm_init() so enable_ept is properly set + * up. Hand the parameter mitigation value in which was stored in + * the pre module init parser. If no parameter was given, it will + * contain 'auto' which will be turned into the default 'cond' + * mitigation mode. + */ + if (boot_cpu_has(X86_BUG_L1TF)) { + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; + } + } + #ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); @@ -13162,39 +14219,4 @@ static int __init vmx_init(void) return 0; } - -static void __exit vmx_exit(void) -{ -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif - - kvm_exit(); - -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif -} - -module_init(vmx_init) -module_exit(vmx_exit) +module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0046aa70205a..f7dff0457846 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, { "req_event", VCPU_STAT(req_event) }, + { "l1d_flush", VCPU_STAT(l1d_flush) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -847,16 +848,21 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + bool skip_tlb_flush = false; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - if (pcid_enabled) - cr3 &= ~CR3_PCID_INVD; + if (pcid_enabled) { + skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; + cr3 &= ~X86_CR3_PCID_NOFLUSH; + } #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + if (!skip_tlb_flush) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } return 0; } @@ -867,9 +873,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; + kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - kvm_mmu_new_cr3(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -1097,15 +1104,41 @@ static u32 msr_based_features[] = { MSR_F10H_DECFG, MSR_IA32_UCODE_REV, + MSR_IA32_ARCH_CAPABILITIES, }; static unsigned int num_msr_based_features; +u64 kvm_get_arch_capabilities(void) +{ + u64 data; + + rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + + /* + * If we're doing cache flushes (either "always" or "cond") + * we will do one whenever the guest does a vmlaunch/vmresume. + * If an outer hypervisor is doing the cache flush for us + * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * capability to the guest too, and if EPT is disabled we're not + * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will + * require a nested hypervisor to do a flush of its own. + */ + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + + return data; +} +EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); + static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { + case MSR_IA32_ARCH_CAPABILITIES: + msr->data = kvm_get_arch_capabilities(); + break; case MSR_IA32_UCODE_REV: - rdmsrl(msr->index, msr->data); + rdmsrl_safe(msr->index, &msr->data); break; default: if (kvm_x86_ops->get_msr_feature(msr)) @@ -2158,10 +2191,11 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && + (data || !msr_info->host_initiated)) return 1; if (data != 0 && data != ~(u64)0) - return -1; + return 1; vcpu->arch.mcg_ctl = data; break; default: @@ -2549,7 +2583,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } EXPORT_SYMBOL_GPL(kvm_get_msr); -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2564,7 +2598,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.mcg_cap; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && !host) return 1; data = vcpu->arch.mcg_ctl; break; @@ -2697,7 +2731,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr_info->index, &msr_info->data); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data, + msr_info->host_initiated); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2718,7 +2753,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_get_msr_common(vcpu, - msr_info->index, &msr_info->data); + msr_info->index, &msr_info->data, + msr_info->host_initiated); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2942,6 +2978,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; break; + case KVM_CAP_NESTED_STATE: + r = kvm_x86_ops->get_nested_state ? + kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + break; default: break; } @@ -3958,6 +3998,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_GET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + u32 user_data_size; + + r = -EINVAL; + if (!kvm_x86_ops->get_nested_state) + break; + + BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + if (get_user(user_data_size, &user_kvm_nested_state->size)) + return -EFAULT; + + r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + user_data_size); + if (r < 0) + return r; + + if (r > user_data_size) { + if (put_user(r, &user_kvm_nested_state->size)) + return -EFAULT; + return -E2BIG; + } + r = 0; + break; + } + case KVM_SET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + struct kvm_nested_state kvm_state; + + r = -EINVAL; + if (!kvm_x86_ops->set_nested_state) + break; + + if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) + return -EFAULT; + + if (kvm_state.size < sizeof(kvm_state)) + return -EINVAL; + + if (kvm_state.flags & + ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + /* nested_run_pending implies guest_mode. */ + if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) + return -EINVAL; + + r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + break; + } default: r = -EINVAL; } @@ -4874,6 +4964,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + /* kvm_write_guest_virt_system can pull in tons of pages. */ + vcpu->arch.l1tf_flush_l1d = true; + return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -6050,6 +6143,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, bool writeback = true; bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + vcpu->arch.l1tf_flush_l1d = true; + /* * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. @@ -6471,8 +6566,12 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - /* Mask the reserved physical address bits. */ - mask = rsvd_bits(maxphyaddr, 51); + + /* + * Mask the uppermost physical address bit, which would be reserved as + * long as the supported physical address width is less than 52. + */ + mask = 1ull << 51; /* Set the present bit. */ mask |= 1ull; @@ -6737,6 +6836,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_CLOCK_PAIRING: ret = kvm_pv_clock_pairing(vcpu, a0, a1); break; + case KVM_HC_SEND_IPI: + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; #endif default: ret = -KVM_ENOSYS; @@ -7255,6 +7357,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) + kvm_x86_ops->get_vmcs12_pages(vcpu); if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -7270,6 +7374,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) + kvm_mmu_load_cr3(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { @@ -7579,6 +7685,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vcpu->arch.l1tf_flush_l1d = true; for (;;) { if (kvm_vcpu_running(vcpu)) { @@ -7980,6 +8087,10 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8010,10 +8121,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) struct desc_ptr dt; int ret = -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - goto out; - if (kvm_valid_sregs(vcpu, sregs)) goto out; @@ -8698,6 +8805,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { + vcpu->arch.l1tf_flush_l1d = true; kvm_x86_ops->sched_in(vcpu, cpu); } diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 298ef1479240..3b24dc05251c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe) /* Copy successful. Return zero */ .L_done_memcpy_trap: - xorq %rax, %rax + xorl %eax, %eax ret ENDPROC(__memcpy_mcsafe) EXPORT_SYMBOL_GPL(__memcpy_mcsafe) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2f3c9196b834..a12afff146d1 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -111,6 +111,8 @@ static struct addr_marker address_markers[] = { [END_OF_SPACE_NR] = { -1, NULL } }; +#define INIT_PGD ((pgd_t *) &init_top_pgt) + #else /* CONFIG_X86_64 */ enum address_markers_idx { @@ -121,6 +123,9 @@ enum address_markers_idx { #ifdef CONFIG_HIGHMEM PKMAP_BASE_NR, #endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + LDT_NR, +#endif CPU_ENTRY_AREA_NR, FIXADDR_START_NR, END_OF_SPACE_NR, @@ -134,11 +139,16 @@ static struct addr_marker address_markers[] = { #ifdef CONFIG_HIGHMEM [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, #endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { 0UL, "LDT remap" }, +#endif [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, [END_OF_SPACE_NR] = { -1, NULL } }; +#define INIT_PGD (swapper_pg_dir) + #endif /* !CONFIG_X86_64 */ /* Multipliers for offsets within the PTEs */ @@ -496,11 +506,7 @@ static inline bool is_hypervisor_range(int idx) static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx, bool dmesg) { -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_top_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif + pgd_t *start = INIT_PGD; pgprotval_t prot, eff; int i; struct pg_state st = {}; @@ -563,12 +569,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) } EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); -static void ptdump_walk_user_pgd_level_checkwx(void) +void ptdump_walk_user_pgd_level_checkwx(void) { #ifdef CONFIG_PAGE_TABLE_ISOLATION - pgd_t *pgd = (pgd_t *) &init_top_pgt; + pgd_t *pgd = INIT_PGD; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!(__supported_pte_mask & _PAGE_NX) || + !static_cpu_has(X86_FEATURE_PTI)) return; pr_info("x86/mm: Checking user space page tables\n"); @@ -580,7 +587,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void) void ptdump_walk_pgd_level_checkwx(void) { ptdump_walk_pgd_level_core(NULL, NULL, true, false); - ptdump_walk_user_pgd_level_checkwx(); } static int __init pt_dump_init(void) @@ -609,6 +615,9 @@ static int __init pt_dump_init(void) # endif address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; +# ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +# endif #endif return 0; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2aafa6ab6103..b9123c497e0a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,6 +16,7 @@ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ +#include <linux/mm_types.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ @@ -317,8 +318,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Synchronize this task's top level page-table * with the 'reference' page table. @@ -1001,7 +1000,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 *pkey, unsigned int fault) + unsigned long address, u32 *pkey, vm_fault_t fault) { if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1215,7 +1214,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - int fault, major = 0; + vm_fault_t fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; u32 pkey; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cee58a972cb2..acfab322fbe0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -4,6 +4,8 @@ #include <linux/swap.h> #include <linux/memblock.h> #include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/swapfile.h> +#include <linux/swapops.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -773,13 +775,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } +/* + * begin/end can be in the direct map or the "high kernel mapping" + * used for the kernel image only. free_init_pages() will do the + * right thing for either kind of address. + */ +void free_kernel_image_pages(void *begin, void *end) +{ + unsigned long begin_ul = (unsigned long)begin; + unsigned long end_ul = (unsigned long)end; + unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; + + + free_init_pages("unused kernel image", begin_ul, end_ul); + + /* + * PTI maps some of the kernel into userspace. For performance, + * this includes some kernel areas that do not contain secrets. + * Those areas might be adjacent to the parts of the kernel image + * being freed, which may contain secrets. Remove the "high kernel + * image mapping" for these freed areas, ensuring they are not even + * potentially vulnerable to Meltdown regardless of the specific + * optimizations PTI is currently using. + * + * The "noalias" prevents unmapping the direct map alias which is + * needed to access the freed pages. + * + * This is only valid for 64bit kernels. 32bit has only one mapping + * which can't be treated in this way for obvious reasons. + */ + if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) + set_memory_np_noalias(begin_ul, len_pages); +} + void __ref free_initmem(void) { e820__reallocate_tables(); - free_init_pages("unused kernel", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + free_kernel_image_pages(&__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD @@ -880,3 +913,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); __pte2cachemode_tbl[entry] = cache; } + +#ifdef CONFIG_SWAP +unsigned long max_swapfile_size(void) +{ + unsigned long pages; + + pages = generic_max_swapfile_size(); + + if (boot_cpu_has_bug(X86_BUG_L1TF)) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ + unsigned long l1tf_limit = l1tf_pfn_limit() + 1; + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. + */ +#if CONFIG_PGTABLE_LEVELS > 2 + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; +#endif + pages = min_t(unsigned long, l1tf_limit, pages); + } + return pages; +} +#endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a688617c727e..dd519f372169 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1283,20 +1283,10 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_init_pages("unused kernel", - (unsigned long) __va(__pa_symbol(text_end)), - (unsigned long) __va(__pa_symbol(rodata_start))); - free_init_pages("unused kernel", - (unsigned long) __va(__pa_symbol(rodata_end)), - (unsigned long) __va(__pa_symbol(_sdata))); + free_kernel_image_pages((void *)text_end, (void *)rodata_start); + free_kernel_image_pages((void *)rodata_end, (void *)_sdata); debug_checkwx(); - - /* - * Do this after all of the manipulation of the - * kernel text page tables are complete. - */ - pti_clone_kernel_text(); } int kern_addr_valid(unsigned long addr) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 7c8686709636..79eb55ce69a9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { + pmd_t new_pmd; pmdval_t v = pmd_val(*pmd); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pmd(pmd, __pmd(v)); + *old = v; + new_pmd = pmd_mknotpresent(*pmd); + } else { + /* Presume this has been called with clear==true previously */ + new_pmd = __pmd(*old); + } + set_pmd(pmd, new_pmd); } static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pte_atomic(pte, __pte(v)); + *old = v; + /* Nothing should care about address */ + pte_clear(&init_mm, 0, pte); + } else { + /* Presume this has been called with clear==true previously */ + set_pte_atomic(pte, __pte(*old)); + } } static int clear_page_presence(struct kmmio_fault_page *f, bool clear) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 48c591251600..f40ab8185d94 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -240,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) return phys_addr_valid(addr + count - 1); } + +/* + * Only allow root to set high MMIO mappings to PROT_NONE. + * This prevents an unpriv. user to set them to PROT_NONE and invert + * them, then pointing to valid memory for L1TF speculation. + * + * Note: for locked down kernels may want to disable the root override. + */ +bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return true; + if (!__pte_needs_invert(pgprot_val(prot))) + return true; + /* If it's real memory always allow */ + if (pfn_valid(pfn)) + return true; + if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + return false; + return true; +} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 34a2a3bfde9c..b54d52a2d00a 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -61,7 +61,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = nid; + emu_nid_to_phys[nid] = pb->nid; pb->start += size; if (pb->start >= pb->end) { @@ -198,40 +198,73 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) return end; } +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) +{ + unsigned long max_pfn = PHYS_PFN(max_addr); + unsigned long base_pfn = PHYS_PFN(base); + unsigned long hole_pfns = PHYS_PFN(hole); + + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); +} + /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from * `addr' to `max_addr'. * * Returns zero on success or negative on error. */ -static int __init split_nodes_size_interleave(struct numa_meminfo *ei, +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size) + u64 addr, u64 max_addr, u64 size, + int nr_nodes, struct numa_memblk *pblk, + int nid) { nodemask_t physnode_mask = numa_nodes_parsed; + int i, ret, uniform = 0; u64 min_size; - int nid = 0; - int i, ret; - if (!size) + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) return -1; + /* - * The limit on emulated nodes is MAX_NUMNODES, so the size per node is - * increased accordingly if the requested size is too small. This - * creates a uniform distribution of node sizes across the entire - * machine (but not necessarily over physical nodes). + * In the 'uniform' case split the passed in physical node by + * nr_nodes, in the non-uniform case, ignore the passed in + * physical block and try to create nodes of at least size + * @size. + * + * In the uniform case, split the nodes strictly by physical + * capacity, i.e. ignore holes. In the non-uniform case account + * for holes and treat @size as a minimum floor. */ - min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; - min_size = max(min_size, FAKE_NODE_MIN_SIZE); - if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) - min_size = (min_size + FAKE_NODE_MIN_SIZE) & - FAKE_NODE_MIN_HASH_MASK; + if (!nr_nodes) + nr_nodes = MAX_NUMNODES; + else { + nodes_clear(physnode_mask); + node_set(pblk->nid, physnode_mask); + uniform = 1; + } + + if (uniform) { + min_size = uniform_size(max_addr, addr, 0, nr_nodes); + size = min_size; + } else { + /* + * The limit on emulated nodes is MAX_NUMNODES, so the + * size per node is increased accordingly if the + * requested size is too small. This creates a uniform + * distribution of node sizes across the entire machine + * (but not necessarily over physical nodes). + */ + min_size = uniform_size(max_addr, addr, + mem_hole_size(addr, max_addr), nr_nodes); + } + min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); if (size < min_size) { pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", size >> 20, min_size >> 20); size = min_size; } - size &= FAKE_NODE_MIN_HASH_MASK; + size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); /* * Fill physical nodes with fake nodes of size until there is no memory @@ -248,10 +281,14 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, node_clear(i, physnode_mask); continue; } + start = pi->blk[phys_blk].start; limit = pi->blk[phys_blk].end; - end = find_end_of_node(start, limit, size); + if (uniform) + end = start + size; + else + end = find_end_of_node(start, limit, size); /* * If there won't be at least FAKE_NODE_MIN_SIZE of * non-reserved memory in ZONE_DMA32 for the next node, @@ -266,7 +303,8 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (limit - end - mem_hole_size(end, limit) < size) + if ((limit - end - mem_hole_size(end, limit) < size) + && !uniform) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, @@ -276,7 +314,15 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return ret; } } - return 0; + return nid; +} + +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, + 0, NULL, NUMA_NO_NODE); } int __init setup_emu2phys_nid(int *dfl_phys_nid) @@ -346,7 +392,28 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * the fixed node size. Otherwise, if it is just a single number N, * split the system RAM into N fake nodes. */ - if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + if (strchr(emu_cmdline, 'U')) { + nodemask_t physnode_mask = numa_nodes_parsed; + unsigned long n; + int nid = 0; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = -1; + for_each_node_mask(i, physnode_mask) { + ret = split_nodes_size_interleave_uniform(&ei, &pi, + pi.blk[i].start, pi.blk[i].end, 0, + n, &pi.blk[i], nid); + if (ret < 0) + break; + if (ret < n) { + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", + __func__, i, ret, n); + ret = -1; + break; + } + nid = ret; + } + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { u64 size; size = memparse(emu_cmdline, &emu_cmdline); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3bded76e8d5c..8d6c34fe49be 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 +#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -1014,8 +1015,8 @@ static long populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | - massage_pgprot(pmd_pgprot))); + set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, + canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; @@ -1087,8 +1088,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | - massage_pgprot(pud_pgprot))); + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, + canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; @@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + /* Has caller explicitly disabled alias checking? */ + if (in_flag & CPA_NO_CHECK_ALIAS) + checkalias = 0; ret = __change_page_attr_set_clr(&cpa, checkalias); @@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages) return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } +int set_memory_np_noalias(unsigned long addr, int numpages) +{ + int cpa_flags = CPA_NO_CHECK_ALIAS; + + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(_PAGE_PRESENT), 0, + cpa_flags, NULL); +} + int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), @@ -1784,6 +1797,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages) __pgprot(_PAGE_GLOBAL), 0); } +int set_memory_global(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 47b5951e592b..3ef095c70ae3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -182,6 +182,14 @@ static void pgd_dtor(pgd_t *pgd) */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +/* + * We allocate separate PMDs for the kernel part of the user page-table + * when PTI is enabled. We need them to map the per-process LDT into the + * user-space page-table. + */ +#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ + KERNEL_PGD_PTRS : 0) + void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); @@ -202,14 +210,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 - +#define PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ -static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; - for(i = 0; i < PREALLOCATED_PMDS; i++) + for (i = 0; i < count; i++) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); @@ -217,7 +225,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) } } -static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; @@ -226,7 +234,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - for(i = 0; i < PREALLOCATED_PMDS; i++) { + for (i = 0; i < count; i++) { pmd_t *pmd = (pmd_t *)__get_free_page(gfp); if (!pmd) failed = true; @@ -241,7 +249,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) } if (failed) { - free_pmds(mm, pmds); + free_pmds(mm, pmds, count); return -ENOMEM; } @@ -254,23 +262,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) * preallocate which never got a corresponding vma will need to be * freed manually. */ +static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = *pgdp; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + *pgdp = native_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); + } +} + static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; - for(i = 0; i < PREALLOCATED_PMDS; i++) { - pgd_t pgd = pgdp[i]; + for (i = 0; i < PREALLOCATED_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i]); - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); +#ifdef CONFIG_PAGE_TABLE_ISOLATION - pgdp[i] = native_make_pgd(0); + if (!static_cpu_has(X86_FEATURE_PTI)) + return; - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - mm_dec_nr_pmds(mm); - } - } + pgdp = kernel_to_user_pgdp(pgdp); + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); +#endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) @@ -296,6 +319,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ + pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + p4d_t *u_p4d; + pud_t *u_pud; + int i; + + u_p4d = p4d_offset(u_pgd, 0); + u_pud = pud_offset(u_p4d, 0); + + s_pgd += KERNEL_PGD_BOUNDARY; + u_pud += KERNEL_PGD_BOUNDARY; + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { + pmd_t *pmd = pmds[i]; + + memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, u_pud, pmd); + } + +} +#else +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ +} +#endif /* * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also * assumes that pgd should be in one page. @@ -329,9 +384,6 @@ static int __init pgd_cache_init(void) */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); - if (!pgd_cache) - return -ENOMEM; - return 0; } core_initcall(pgd_cache_init); @@ -343,7 +395,8 @@ static inline pgd_t *_pgd_alloc(void) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, + PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate @@ -355,7 +408,7 @@ static inline pgd_t *_pgd_alloc(void) static inline void _pgd_free(pgd_t *pgd) { if (!SHARED_KERNEL_PMD) - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); else kmem_cache_free(pgd_cache, pgd); } @@ -375,6 +428,7 @@ static inline void _pgd_free(pgd_t *pgd) pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; + pmd_t *u_pmds[PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(); @@ -384,12 +438,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(mm, pmds) != 0) + if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; - if (paravirt_pgd_alloc(mm) != 0) + if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_user_pmds; + /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they @@ -399,13 +456,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; +out_free_user_pmds: + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: - free_pmds(mm, pmds); + free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: @@ -719,28 +779,50 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } +#ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. * @pud: Pointer to a PUD. + * @addr: Virtual address associated with pud. * - * Context: The pud range has been unmaped and TLB purged. + * Context: The pud range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. + * + * NOTE: Callers must allow a single page allocation. */ -int pud_free_pmd_page(pud_t *pud) +int pud_free_pmd_page(pud_t *pud, unsigned long addr) { - pmd_t *pmd; + pmd_t *pmd, *pmd_sv; + pte_t *pte; int i; if (pud_none(*pud)) return 1; pmd = (pmd_t *)pud_page_vaddr(*pud); + pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); + if (!pmd_sv) + return 0; - for (i = 0; i < PTRS_PER_PMD; i++) - if (!pmd_free_pte_page(&pmd[i])) - return 0; + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_sv[i] = pmd[i]; + if (!pmd_none(pmd[i])) + pmd_clear(&pmd[i]); + } pud_clear(pud); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd_sv[i])) { + pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); + free_page((unsigned long)pte); + } + } + + free_page((unsigned long)pmd_sv); free_page((unsigned long)pmd); return 1; @@ -749,11 +831,12 @@ int pud_free_pmd_page(pud_t *pud) /** * pmd_free_pte_page - Clear pmd entry and free pte page. * @pmd: Pointer to a PMD. + * @addr: Virtual address associated with pmd. * - * Context: The pmd range has been unmaped and TLB purged. + * Context: The pmd range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ -int pmd_free_pte_page(pmd_t *pmd) +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; @@ -762,8 +845,30 @@ int pmd_free_pte_page(pmd_t *pmd) pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + free_page((unsigned long)pte); return 1; } + +#else /* !CONFIG_X86_64 */ + +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + return pud_none(*pud); +} + +/* + * Disable free page handling on x86-PAE. This assures that ioremap() + * does not update sync'd pmd entries. See vmalloc_sync_one(). + */ +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + return pmd_none(*pmd); +} + +#endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 4d418e705878..31341ae7309f 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -45,6 +45,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/desc.h> +#include <asm/sections.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt @@ -54,6 +55,16 @@ #define __GFP_NOTRACK 0 #endif +/* + * Define the page-table levels we clone for user-space on 32 + * and 64 bit. + */ +#ifdef CONFIG_X86_64 +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD +#else +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE +#endif + static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) @@ -117,7 +128,7 @@ enable: setup_force_cpu_cap(X86_FEATURE_PTI); } -pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* * Changes to the high (kernel) portion of the kernelmode page @@ -176,7 +187,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) if (pgd_none(*pgd)) { unsigned long new_p4d_page = __get_free_page(gfp); - if (!new_p4d_page) + if (WARN_ON_ONCE(!new_p4d_page)) return NULL; set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); @@ -195,13 +206,17 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - p4d_t *p4d = pti_user_pagetable_walk_p4d(address); + p4d_t *p4d; pud_t *pud; + p4d = pti_user_pagetable_walk_p4d(address); + if (!p4d) + return NULL; + BUILD_BUG_ON(p4d_large(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); - if (!new_pud_page) + if (WARN_ON_ONCE(!new_pud_page)) return NULL; set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); @@ -215,7 +230,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) } if (pud_none(*pud)) { unsigned long new_pmd_page = __get_free_page(gfp); - if (!new_pmd_page) + if (WARN_ON_ONCE(!new_pmd_page)) return NULL; set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); @@ -224,7 +239,6 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) return pmd_offset(pud, address); } -#ifdef CONFIG_X86_VSYSCALL_EMULATION /* * Walk the shadow copy of the page tables (optionally) trying to allocate * page table pages on the way down. Does not support large pages. @@ -237,9 +251,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - pmd_t *pmd = pti_user_pagetable_walk_pmd(address); + pmd_t *pmd; pte_t *pte; + pmd = pti_user_pagetable_walk_pmd(address); + if (!pmd) + return NULL; + /* We can't do anything sensible if we hit a large mapping. */ if (pmd_large(*pmd)) { WARN_ON(1); @@ -262,6 +280,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) return pte; } +#ifdef CONFIG_X86_VSYSCALL_EMULATION static void __init pti_setup_vsyscall(void) { pte_t *pte, *target_pte; @@ -282,8 +301,14 @@ static void __init pti_setup_vsyscall(void) static void __init pti_setup_vsyscall(void) { } #endif +enum pti_clone_level { + PTI_CLONE_PMD, + PTI_CLONE_PTE, +}; + static void -pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +pti_clone_pgtable(unsigned long start, unsigned long end, + enum pti_clone_level level) { unsigned long addr; @@ -291,59 +316,105 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) * Clone the populated PMDs which cover start to end. These PMD areas * can have holes. */ - for (addr = start; addr < end; addr += PMD_SIZE) { + for (addr = start; addr < end;) { + pte_t *pte, *target_pte; pmd_t *pmd, *target_pmd; pgd_t *pgd; p4d_t *p4d; pud_t *pud; + /* Overflow check */ + if (addr < start) + break; + pgd = pgd_offset_k(addr); if (WARN_ON(pgd_none(*pgd))) return; p4d = p4d_offset(pgd, addr); if (WARN_ON(p4d_none(*p4d))) return; + pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + if (pud_none(*pud)) { + addr += PUD_SIZE; continue; + } + pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (pmd_none(*pmd)) { + addr += PMD_SIZE; continue; + } - target_pmd = pti_user_pagetable_walk_pmd(addr); - if (WARN_ON(!target_pmd)) - return; - - /* - * Only clone present PMDs. This ensures only setting - * _PAGE_GLOBAL on present PMDs. This should only be - * called on well-known addresses anyway, so a non- - * present PMD would be a surprise. - */ - if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) - return; - - /* - * Setting 'target_pmd' below creates a mapping in both - * the user and kernel page tables. It is effectively - * global, so set it as global in both copies. Note: - * the X86_FEATURE_PGE check is not _required_ because - * the CPU ignores _PAGE_GLOBAL when PGE is not - * supported. The check keeps consistentency with - * code that only set this bit when supported. - */ - if (boot_cpu_has(X86_FEATURE_PGE)) - *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); - - /* - * Copy the PMD. That is, the kernelmode and usermode - * tables will share the last-level page tables of this - * address range - */ - *target_pmd = pmd_clear_flags(*pmd, clear); + if (pmd_large(*pmd) || level == PTI_CLONE_PMD) { + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; + + /* + * Only clone present PMDs. This ensures only setting + * _PAGE_GLOBAL on present PMDs. This should only be + * called on well-known addresses anyway, so a non- + * present PMD would be a surprise. + */ + if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) + return; + + /* + * Setting 'target_pmd' below creates a mapping in both + * the user and kernel page tables. It is effectively + * global, so set it as global in both copies. Note: + * the X86_FEATURE_PGE check is not _required_ because + * the CPU ignores _PAGE_GLOBAL when PGE is not + * supported. The check keeps consistentency with + * code that only set this bit when supported. + */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); + + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = *pmd; + + addr += PMD_SIZE; + + } else if (level == PTI_CLONE_PTE) { + + /* Walk the page-table down to the pte level */ + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + addr += PAGE_SIZE; + continue; + } + + /* Only clone present PTEs */ + if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) + return; + + /* Allocate PTE in the user page-table */ + target_pte = pti_user_pagetable_walk_pte(addr); + if (WARN_ON(!target_pte)) + return; + + /* Set GLOBAL bit in both PTEs */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pte = pte_set_flags(*pte, _PAGE_GLOBAL); + + /* Clone the PTE */ + *target_pte = *pte; + + addr += PAGE_SIZE; + + } else { + BUG(); + } } } +#ifdef CONFIG_X86_64 /* * Clone a single p4d (i.e. a top-level entry on 4-level systems and a * next-level entry on 5-level systems. @@ -354,6 +425,9 @@ static void __init pti_clone_p4d(unsigned long addr) pgd_t *kernel_pgd; user_p4d = pti_user_pagetable_walk_p4d(addr); + if (!user_p4d) + return; + kernel_pgd = pgd_offset_k(addr); kernel_p4d = p4d_offset(kernel_pgd, addr); *user_p4d = *kernel_p4d; @@ -367,6 +441,25 @@ static void __init pti_clone_user_shared(void) pti_clone_p4d(CPU_ENTRY_AREA_BASE); } +#else /* CONFIG_X86_64 */ + +/* + * On 32 bit PAE systems with 1GB of Kernel address space there is only + * one pgd/p4d for the whole kernel. Cloning that would map the whole + * address space into the user page-tables, making PTI useless. So clone + * the page-table on the PMD level to prevent that. + */ +static void __init pti_clone_user_shared(void) +{ + unsigned long start, end; + + start = CPU_ENTRY_AREA_BASE; + end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); + + pti_clone_pgtable(start, end, PTI_CLONE_PMD); +} +#endif /* CONFIG_X86_64 */ + /* * Clone the ESPFIX P4D into the user space visible page table */ @@ -380,11 +473,11 @@ static void __init pti_setup_espfix64(void) /* * Clone the populated PMDs of the entry and irqentry text and force it RO. */ -static void __init pti_clone_entry_text(void) +static void pti_clone_entry_text(void) { - pti_clone_pmds((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end, - _PAGE_RW); + pti_clone_pgtable((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, + PTI_CLONE_PMD); } /* @@ -435,10 +528,17 @@ static inline bool pti_kernel_image_global_ok(void) } /* + * This is the only user for these and it is not arch-generic + * like the other set_memory.h functions. Just extern them. + */ +extern int set_memory_nonglobal(unsigned long addr, int numpages); +extern int set_memory_global(unsigned long addr, int numpages); + +/* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ -void pti_clone_kernel_text(void) +static void pti_clone_kernel_text(void) { /* * rodata is part of the kernel image and is normally @@ -446,7 +546,8 @@ void pti_clone_kernel_text(void) * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); - unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long end_clone = (unsigned long)__end_rodata_aligned; + unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table); if (!pti_kernel_image_global_ok()) return; @@ -458,14 +559,18 @@ void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pmds(start, end, _PAGE_RW); + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); + + /* + * pti_clone_pgtable() will set the global bit in any PMDs + * that it clones, but we also need to get any PTEs in + * the last level for areas that are not huge-page-aligned. + */ + + /* Set the global bit for normal non-__init kernel text: */ + set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } -/* - * This is the only user for it and it is not arch-generic like - * the other set_memory.h functions. Just extern it. - */ -extern int set_memory_nonglobal(unsigned long addr, int numpages); void pti_set_kernel_image_nonglobal(void) { /* @@ -477,9 +582,11 @@ void pti_set_kernel_image_nonglobal(void) unsigned long start = PFN_ALIGN(_text); unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); - if (pti_kernel_image_global_ok()) - return; - + /* + * This clears _PAGE_GLOBAL from the entire kernel image. + * pti_clone_kernel_text() map put _PAGE_GLOBAL back for + * areas that are mapped to userspace. + */ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } @@ -493,6 +600,28 @@ void __init pti_init(void) pr_info("enabled\n"); +#ifdef CONFIG_X86_32 + /* + * We check for X86_FEATURE_PCID here. But the init-code will + * clear the feature flag on 32 bit because the feature is not + * supported on 32 bit anyway. To print the warning we need to + * check with cpuid directly again. + */ + if (cpuid_ecx(0x1) & BIT(17)) { + /* Use printk to work around pr_fmt() */ + printk(KERN_WARNING "\n"); + printk(KERN_WARNING "************************************************************\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); + printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); + printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "************************************************************\n"); + } +#endif + pti_clone_user_shared(); /* Undo all global bits from the init pagetables in head_64.S: */ @@ -502,3 +631,22 @@ void __init pti_init(void) pti_setup_espfix64(); pti_setup_vsyscall(); } + +/* + * Finalize the kernel mappings in the userspace page-table. Some of the + * mappings for the kernel image might have changed since pti_init() + * cloned them. This is because parts of the kernel image have been + * mapped RO and/or NX. These changes need to be cloned again to the + * userspace page-table. + */ +void pti_finalize(void) +{ + /* + * We need to clone everything (again) that maps parts of the + * kernel image. + */ + pti_clone_entry_text(); + pti_clone_kernel_text(); + + debug_checkwx_user(); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6eb1f34c3c85..752dbf4e0e50 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> +#include <linux/gfp.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -35,7 +36,7 @@ * necessary invalidation by clearing out the 'ctx_id' which * forces a TLB flush when the context is loaded. */ -void clear_asid_other(void) +static void clear_asid_other(void) { u16 asid; @@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; + bool need_flush; + u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* - * We don't currently support having a real mm loaded without - * our cpu set in mm_cpumask(). We have all the bookkeeping - * in place to figure out whether we would need to flush - * if our cpu were cleared in mm_cpumask(), but we don't - * currently use it. + * Even in lazy TLB mode, the CPU should stay set in the + * mm_cpumask. The TLB shootdown code can figure out from + * from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. No TLB flush required. + */ + if (!was_lazy) + return; + + /* + * Read the tlb_gen to check whether a flush is needed. + * If the TLB is up to date, just use it. + * The barrier synchronizes with the tlb_gen increment in + * the TLB shootdown code. + */ + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == + next_tlb_gen) + return; + + /* + * TLB contents went out of date while we were in lazy + * mode. Fall through to the TLB switching code below. + */ + new_asid = prev_asid; + need_flush = true; } else { - u16 new_asid; - bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, sync_current_stack_to_mm(next); } - /* Stop remote flushes for the previous mm */ - VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + /* + * Stop remote flushes for the previous mm. + * Skip kernel threads; we never send init_mm TLB flushing IPIs, + * but the bitmap manipulation can cause cache line contention. + */ + if (real_prev != &init_mm) { + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, + mm_cpumask(real_prev))); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + } /* * Start remote flushes and then read tlb_gen. */ - cpumask_set_cpu(cpu, mm_cpumask(next)); + if (next != &init_mm) + cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); - - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } + /* + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. + */ + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + load_mm_cr4(next); switch_ldt(real_prev, next); } @@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (tlb_defer_switch_to_init_mm()) { - /* - * There's a significant optimization that may be possible - * here. We have accurate enough TLB flush tracking that we - * don't need to maintain coherence of TLB per se when we're - * lazy. We do, however, need to maintain coherence of - * paging-structure caches. We could, in principle, leave our - * old mm loaded and only switch to init_mm when - * tlb_remove_page() happens. - */ - this_cpu_write(cpu_tlbstate.is_lazy, true); - } else { - switch_mm(NULL, &init_mm, NULL); - } + this_cpu_write(cpu_tlbstate.is_lazy, true); } /* @@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_others skipping + * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { + cpumask_var_t lazymask; + unsigned int cpu; + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); @@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ - unsigned int cpu; - cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) @@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func_remote, + + /* + * A temporary cpumask is used in order to skip sending IPIs + * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm). + * If the allocation fails, simply IPI every CPU in mm_cpumask. + */ + if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) { + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); + return; + } + + cpumask_copy(lazymask, cpumask); + + for_each_cpu(cpu, lazymask) { + if (per_cpu(cpu_tlbstate.is_lazy, cpu)) + cpumask_clear_cpu(cpu, lazymask); + } + + smp_call_function_many(lazymask, flush_tlb_func_remote, + (void *)info, 1); + + free_cpumask_var(lazymask); } /* @@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_cpu(); } +void tlb_flush_remove_tables_local(void *arg) +{ + struct mm_struct *mm = arg; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm && + this_cpu_read(cpu_tlbstate.is_lazy)) { + /* + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. + */ + switch_mm_irqs_off(NULL, &init_mm, NULL); + } +} + +static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm, + struct cpumask *lazy_cpus) +{ + int cpu; + + for_each_cpu(cpu, mm_cpumask(mm)) { + if (!per_cpu(cpu_tlbstate.is_lazy, cpu)) + cpumask_set_cpu(cpu, lazy_cpus); + } +} + +void tlb_flush_remove_tables(struct mm_struct *mm) +{ + int cpu = get_cpu(); + cpumask_var_t lazy_cpus; + + if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) { + put_cpu(); + return; + } + + if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) { + /* + * If the cpumask allocation fails, do a brute force flush + * on all the CPUs that have this mm loaded. + */ + smp_call_function_many(mm_cpumask(mm), + tlb_flush_remove_tables_local, (void *)mm, 1); + put_cpu(); + return; + } + + /* + * CPUs with !is_lazy either received a TLB flush IPI while the user + * pages in this address range were unmapped, or have context switched + * and reloaded %CR3 since then. + * + * Shootdown IPIs at page table freeing time only need to be sent to + * CPUs that may have out of date TLB contents. + */ + mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus); + smp_call_function_many(lazy_cpus, + tlb_flush_remove_tables_local, (void *)mm, 1); + free_cpumask_var(lazy_cpus); + put_cpu(); +} static void do_flush_tlb_all(void *info) { diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 55799873ebe5..8f6cc71e0848 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1441,8 +1441,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth) /* sub esp,STACK_SIZE */ EMIT2_off32(0x81, 0xEC, STACK_SIZE); - /* sub ebp,SCRATCH_SIZE+4+12*/ - EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16); + /* sub ebp,SCRATCH_SIZE+12*/ + EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 12); /* xor ebx,ebx */ EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX)); @@ -1475,8 +1475,8 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) /* mov edx,dword ptr [ebp+off]*/ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1])); - /* add ebp,SCRATCH_SIZE+4+12*/ - EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16); + /* add ebp,SCRATCH_SIZE+12*/ + EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 12); /* mov ebx,dword ptr [ebp-12]*/ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 563049c483a1..d4ec117c1142 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -22,7 +22,6 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; -unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; int noioapicquirk; @@ -599,9 +598,6 @@ char *__init pcibios_setup(char *str) pci_probe |= PCI_BIG_ROOT_WINDOW; return NULL; #endif - } else if (!strcmp(str, "earlydump")) { - pci_early_dump_regs = 1; - return NULL; } else if (!strcmp(str, "routeirq")) { pci_routeirq = 1; return NULL; diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index e5f753cbb1c3..f5fc953e5848 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -57,47 +57,3 @@ int early_pci_allowed(void) PCI_PROBE_CONF1; } -void early_dump_pci_device(u8 bus, u8 slot, u8 func) -{ - u32 value[256 / 4]; - int i; - - pr_info("pci 0000:%02x:%02x.%d config space:\n", bus, slot, func); - - for (i = 0; i < 256; i += 4) - value[i / 4] = read_pci_config(bus, slot, func, i); - - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_OFFSET, 16, 1, value, 256, false); -} - -void early_dump_pci_devices(void) -{ - unsigned bus, slot, func; - - if (!early_pci_allowed()) - return; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u8 type; - - class = read_pci_config(bus, slot, func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - continue; - - early_dump_pci_device(bus, slot, func); - - if (func == 0) { - type = read_pci_config_byte(bus, slot, - func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } - } -} diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 77873ce700ae..ee5d08f25ce4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -417,7 +417,7 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; - if (sev_active()) + if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO) flags |= _PAGE_ENC; pfn = md->phys_addr >> PAGE_SHIFT; @@ -636,6 +636,8 @@ void efi_switch_mm(struct mm_struct *mm) #ifdef CONFIG_EFI_MIXED extern efi_status_t efi64_thunk(u32, ...); +static DEFINE_SPINLOCK(efi_runtime_lock); + #define runtime_service32(func) \ ({ \ u32 table = (u32)(unsigned long)efi.systab; \ @@ -657,17 +659,14 @@ extern efi_status_t efi64_thunk(u32, ...); #define efi_thunk(f, ...) \ ({ \ efi_status_t __s; \ - unsigned long __flags; \ u32 __func; \ \ - local_irq_save(__flags); \ arch_efi_call_virt_setup(); \ \ __func = runtime_service32(f); \ __s = efi64_thunk(__func, __VA_ARGS__); \ \ arch_efi_call_virt_teardown(); \ - local_irq_restore(__flags); \ \ __s; \ }) @@ -702,14 +701,17 @@ static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc) { efi_status_t status; u32 phys_tm, phys_tc; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_tm = virt_to_phys_or_null(tm); phys_tc = virt_to_phys_or_null(tc); status = efi_thunk(get_time, phys_tm, phys_tc); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -719,13 +721,16 @@ static efi_status_t efi_thunk_set_time(efi_time_t *tm) { efi_status_t status; u32 phys_tm; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_tm = virt_to_phys_or_null(tm); status = efi_thunk(set_time, phys_tm); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -737,8 +742,10 @@ efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, { efi_status_t status; u32 phys_enabled, phys_pending, phys_tm; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_enabled = virt_to_phys_or_null(enabled); phys_pending = virt_to_phys_or_null(pending); @@ -747,6 +754,7 @@ efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, status = efi_thunk(get_wakeup_time, phys_enabled, phys_pending, phys_tm); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -757,13 +765,16 @@ efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { efi_status_t status; u32 phys_tm; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_tm = virt_to_phys_or_null(tm); status = efi_thunk(set_wakeup_time, enabled, phys_tm); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -781,6 +792,9 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, efi_status_t status; u32 phys_name, phys_vendor, phys_attr; u32 phys_data_size, phys_data; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_data_size = virt_to_phys_or_null(data_size); phys_vendor = virt_to_phys_or_null(vendor); @@ -791,6 +805,8 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, status = efi_thunk(get_variable, phys_name, phys_vendor, phys_attr, phys_data_size, phys_data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -800,6 +816,34 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor, { u32 phys_name, phys_vendor, phys_data; efi_status_t status; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); + + phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); + phys_vendor = virt_to_phys_or_null(vendor); + phys_data = virt_to_phys_or_null_size(data, data_size); + + /* If data_size is > sizeof(u32) we've got problems */ + status = efi_thunk(set_variable, phys_name, phys_vendor, + attr, data_size, phys_data); + + spin_unlock_irqrestore(&efi_runtime_lock, flags); + + return status; +} + +static efi_status_t +efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) +{ + u32 phys_name, phys_vendor, phys_data; + efi_status_t status; + unsigned long flags; + + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); phys_vendor = virt_to_phys_or_null(vendor); @@ -809,6 +853,8 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor, status = efi_thunk(set_variable, phys_name, phys_vendor, attr, data_size, phys_data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -819,6 +865,9 @@ efi_thunk_get_next_variable(unsigned long *name_size, { efi_status_t status; u32 phys_name_size, phys_name, phys_vendor; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_name_size = virt_to_phys_or_null(name_size); phys_vendor = virt_to_phys_or_null(vendor); @@ -827,6 +876,8 @@ efi_thunk_get_next_variable(unsigned long *name_size, status = efi_thunk(get_next_variable, phys_name_size, phys_name, phys_vendor); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -835,10 +886,15 @@ efi_thunk_get_next_high_mono_count(u32 *count) { efi_status_t status; u32 phys_count; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_count = virt_to_phys_or_null(count); status = efi_thunk(get_next_high_mono_count, phys_count); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -847,10 +903,15 @@ efi_thunk_reset_system(int reset_type, efi_status_t status, unsigned long data_size, efi_char16_t *data) { u32 phys_data; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_data = virt_to_phys_or_null_size(data, data_size); efi_thunk(reset_system, reset_type, status, data_size, phys_data); + + spin_unlock_irqrestore(&efi_runtime_lock, flags); } static efi_status_t @@ -872,10 +933,40 @@ efi_thunk_query_variable_info(u32 attr, u64 *storage_space, { efi_status_t status; u32 phys_storage, phys_remaining, phys_max; + unsigned long flags; + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + spin_lock_irqsave(&efi_runtime_lock, flags); + + phys_storage = virt_to_phys_or_null(storage_space); + phys_remaining = virt_to_phys_or_null(remaining_space); + phys_max = virt_to_phys_or_null(max_variable_size); + + status = efi_thunk(query_variable_info, attr, phys_storage, + phys_remaining, phys_max); + + spin_unlock_irqrestore(&efi_runtime_lock, flags); + + return status; +} + +static efi_status_t +efi_thunk_query_variable_info_nonblocking(u32 attr, u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) +{ + efi_status_t status; + u32 phys_storage, phys_remaining, phys_max; + unsigned long flags; if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; + phys_storage = virt_to_phys_or_null(storage_space); phys_remaining = virt_to_phys_or_null(remaining_space); phys_max = virt_to_phys_or_null(max_variable_size); @@ -883,6 +974,8 @@ efi_thunk_query_variable_info(u32 attr, u64 *storage_space, status = efi_thunk(query_variable_info, attr, phys_storage, phys_remaining, phys_max); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -908,9 +1001,11 @@ void efi_thunk_runtime_setup(void) efi.get_variable = efi_thunk_get_variable; efi.get_next_variable = efi_thunk_get_next_variable; efi.set_variable = efi_thunk_set_variable; + efi.set_variable_nonblocking = efi_thunk_set_variable_nonblocking; efi.get_next_high_mono_count = efi_thunk_get_next_high_mono_count; efi.reset_system = efi_thunk_reset_system; efi.query_variable_info = efi_thunk_query_variable_info; + efi.query_variable_info_nonblocking = efi_thunk_query_variable_info_nonblocking; efi.update_capsule = efi_thunk_update_capsule; efi.query_capsule_caps = efi_thunk_query_capsule_caps; } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 36c1f8b9f7e0..844d31cb8a0c 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -105,12 +105,11 @@ early_param("efi_no_storage_paranoia", setup_storage_paranoia); */ void efi_delete_dummy_variable(void) { - efi.set_variable((efi_char16_t *)efi_dummy_name, - &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, NULL); + efi.set_variable_nonblocking((efi_char16_t *)efi_dummy_name, + &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL); } /* @@ -249,7 +248,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) int num_entries; void *new; - if (efi_mem_desc_lookup(addr, &md)) { + if (efi_mem_desc_lookup(addr, &md) || + md.type != EFI_BOOT_SERVICES_DATA) { pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr); return; } diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index fa021dfab088..5cf886c867c2 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfld.o pwr.o +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o pwr.o # SFI specific code ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 4f5fa65a1011..2acd6be13375 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -18,6 +18,7 @@ #include <asm/intel-mid.h> #include <asm/intel_scu_ipc.h> #include <asm/io_apic.h> +#include <asm/hw_irq.h> #define TANGIER_EXT_TIMER0_MSI 12 diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 2ebdf31d9996..56f66eafb94f 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -36,8 +36,6 @@ #include <asm/apb_timer.h> #include <asm/reboot.h> -#include "intel_mid_weak_decls.h" - /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, * cmdline option x86_intel_mid_timer can be used to override the configuration @@ -61,10 +59,6 @@ enum intel_mid_timer_options intel_mid_timer_options; -/* intel_mid_ops to store sub arch ops */ -static struct intel_mid_ops *intel_mid_ops; -/* getter function for sub arch ops*/ -static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT; enum intel_mid_cpu_type __intel_mid_cpu_chip; EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); @@ -82,11 +76,6 @@ static void intel_mid_reboot(void) intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); } -static unsigned long __init intel_mid_calibrate_tsc(void) -{ - return 0; -} - static void __init intel_mid_setup_bp_timer(void) { apbt_time_init(); @@ -133,6 +122,7 @@ static void intel_mid_arch_setup(void) case 0x3C: case 0x4A: __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER; + x86_platform.legacy.rtc = 1; break; case 0x27: default: @@ -140,17 +130,7 @@ static void intel_mid_arch_setup(void) break; } - if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops)) - intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); - else { - intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); - pr_info("ARCH: Unknown SoC, assuming Penwell!\n"); - } - out: - if (intel_mid_ops->arch_setup) - intel_mid_ops->arch_setup(); - /* * Intel MID platforms are using explicitly defined regulators. * @@ -191,7 +171,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.arch_init = intel_mid_pci_init; diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h deleted file mode 100644 index 3c1c3866d82b..000000000000 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * intel_mid_weak_decls.h: Weak declarations of intel-mid.c - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - - -/* For every CPU addition a new get_<cpuname>_ops interface needs - * to be added. - */ -extern void *get_penwell_ops(void); -extern void *get_cloverview_ops(void); -extern void *get_tangier_ops(void); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c deleted file mode 100644 index e42978d4deaf..000000000000 --- a/arch/x86/platform/intel-mid/mfld.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * mfld.c: Intel Medfield platform setup code - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> - -#include <asm/apic.h> -#include <asm/intel-mid.h> -#include <asm/intel_mid_vrtc.h> - -#include "intel_mid_weak_decls.h" - -static unsigned long __init mfld_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = FSB_FREQ_83SKU; - else - fsb = FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - lapic_timer_frequency = fsb * 1000 / HZ; - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. - */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - -static void __init penwell_arch_setup(void) -{ - x86_platform.calibrate_tsc = mfld_calibrate_tsc; -} - -static struct intel_mid_ops penwell_ops = { - .arch_setup = penwell_arch_setup, -}; - -void *get_penwell_ops(void) -{ - return &penwell_ops; -} - -void *get_cloverview_ops(void) -{ - return &penwell_ops; -} diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c deleted file mode 100644 index ae7bdeb0e507..000000000000 --- a/arch/x86/platform/intel-mid/mrfld.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Intel Merrifield platform specific setup code - * - * (C) Copyright 2013 Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/init.h> - -#include <asm/apic.h> -#include <asm/intel-mid.h> - -#include "intel_mid_weak_decls.h" - -static unsigned long __init tangier_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb, bus_freq; - - /* *********************** */ - /* Compute TSC:Ratio * FSB */ - /* *********************** */ - - /* Compute Ratio */ - rdmsr(MSR_PLATFORM_INFO, lo, hi); - pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo); - - ratio = (lo >> 8) & 0xFF; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("Read a zero ratio, force tsc ratio to 4 ...\n"); - ratio = 4; - } - - /* Compute FSB */ - rdmsr(MSR_FSB_FREQ, lo, hi); - pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n", - hi, lo); - - bus_freq = lo & 0x7; - pr_debug("bus_freq = 0x%x\n", bus_freq); - - if (bus_freq == 0) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 1) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 2) - fsb = FSB_FREQ_133SKU; - else if (bus_freq == 3) - fsb = FSB_FREQ_167SKU; - else if (bus_freq == 4) - fsb = FSB_FREQ_83SKU; - else if (bus_freq == 5) - fsb = FSB_FREQ_400SKU; - else if (bus_freq == 6) - fsb = FSB_FREQ_267SKU; - else if (bus_freq == 7) - fsb = FSB_FREQ_333SKU; - else { - BUG(); - pr_err("Invalid bus_freq! Setting to minimal value!\n"); - fsb = FSB_FREQ_100SKU; - } - - /* TSC = FSB Freq * Resolved HFM Ratio */ - fast_calibrate = ratio * fsb; - pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate); - - /* ************************************ */ - /* Calculate Local APIC Timer Frequency */ - /* ************************************ */ - lapic_timer_frequency = (fsb * 1000) / HZ; - - pr_debug("Setting lapic_timer_frequency = %d\n", - lapic_timer_frequency); - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. - */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - -static void __init tangier_arch_setup(void) -{ - x86_platform.calibrate_tsc = tangier_calibrate_tsc; - x86_platform.legacy.rtc = 1; -} - -/* tangier arch ops */ -static struct intel_mid_ops tangier_ops = { - .arch_setup = tangier_arch_setup, -}; - -void *get_tangier_ops(void) -{ - return &tangier_ops; -} diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 7c3077e58fa0..f0e920fb98ad 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -311,10 +311,8 @@ static int __init add_xo1_platform_devices(void) return PTR_ERR(pdev); pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - return 0; + return PTR_ERR_OR_ZERO(pdev); } static int olpc_xo1_ec_probe(struct platform_device *pdev) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index ca446da48fd2..a4130b84d1ff 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1285,6 +1285,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) struct msg_desc msgdesc; ack_APIC_irq(); + kvm_set_cpu_l1tf_flush_l1d(); time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); @@ -1607,8 +1608,6 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr, *tunables[cnt].tunp = val; continue; } - if (q == p) - break; } return 0; } diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 67ccf64c8bd8..f8e3b668d20b 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -233,29 +233,35 @@ struct restore_data_record { */ static int get_e820_md5(struct e820_table *table, void *buf) { - struct scatterlist sg; - struct crypto_ahash *tfm; + struct crypto_shash *tfm; + struct shash_desc *desc; int size; int ret = 0; - tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(tfm)) return -ENOMEM; - { - AHASH_REQUEST_ON_STACK(req, tfm); - size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; - ahash_request_set_tfm(req, tfm); - sg_init_one(&sg, (u8 *)table, size); - ahash_request_set_callback(req, 0, NULL, NULL); - ahash_request_set_crypt(req, &sg, buf, size); - - if (crypto_ahash_digest(req)) - ret = -EINVAL; - ahash_request_zero(req); + desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto free_tfm; } - crypto_free_ahash(tfm); + desc->tfm = tfm; + desc->flags = 0; + + size = offsetof(struct e820_table, entries) + + sizeof(struct e820_entry) * table->nr_entries; + + if (crypto_shash_digest(desc, (u8 *)table, size, buf)) + ret = -EINVAL; + + kzfree(desc); + +free_tfm: + crypto_free_shash(tfm); return ret; } diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index ce8da3a0412c..fd369a6e9ff8 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -137,7 +137,7 @@ ENTRY(restore_registers) /* Saved in save_processor_state. */ lgdt saved_context_gdt_desc(%rax) - xorq %rax, %rax + xorl %eax, %eax /* tell the hibernation core that we've just restored the memory */ movq %rax, in_suspend(%rip) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 2e9ee023e6bc..3cf302b26332 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -6,7 +6,7 @@ purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string targets += $(purgatory-y) PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) -$(obj)/sha256.o: $(srctree)/lib/sha256.c +$(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE $(call if_changed_rule,cc_o_c) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib @@ -28,9 +28,8 @@ $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE targets += kexec-purgatory.c -CMD_BIN2C = $(objtree)/scripts/basic/bin2c quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@ + cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 220e97841e49..3a6c8ebc8032 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -67,6 +67,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__tracedata_(start|end)|" "__(start|stop)_notes|" "__end_rodata|" + "__end_rodata_aligned|" "__initramfs_start|" "(jiffies|jiffies_64)|" #if ELF_BITS == 64 diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 9d529f22fd9d..f518b4744ff8 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -1,13 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -mainmenu "User Mode Linux/$(SUBARCH) $(KERNELVERSION) Kernel Configuration" - -comment "Compiler: $(CC_VERSION_TEXT)" - -source "scripts/Kconfig.include" - -source "arch/um/Kconfig.common" - -menu "UML-specific options" menu "Host processor type and features" @@ -66,9 +57,3 @@ config ARCH_REUSE_HOST_VSYSCALL_AREA config GENERIC_HWEIGHT def_bool y - -source "arch/um/Kconfig.um" - -endmenu - -source "arch/um/Kconfig.rest" diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 744afdc18cf3..56c44d865f7b 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -16,7 +16,7 @@ static int __init gate_vma_init(void) if (!FIXADDR_USER_START) return 0; - gate_vma.vm_mm = NULL; + vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; diff --git a/arch/x86/um/vdso/.gitignore b/arch/x86/um/vdso/.gitignore index 9cac6d072199..f8b69d84238e 100644 --- a/arch/x86/um/vdso/.gitignore +++ b/arch/x86/um/vdso/.gitignore @@ -1,2 +1 @@ -vdso-syms.lds vdso.lds diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index b2d6967262b2..822ccdba93ad 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -53,22 +53,6 @@ $(vobjs): KBUILD_CFLAGS += $(CFL) CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage -targets += vdso-syms.lds -extra-$(VDSO64-y) += vdso-syms.lds - -# -# Match symbols in the DSO that look like VDSO*; produce a file of constants. -# -sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' -quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ -endef - -$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE - $(call if_changed,vdsosym) - # # The DSO images are built using a special linker script. # diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b5318505c69..2eeddd814653 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -3,6 +3,7 @@ #endif #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/slab.h> #include <xen/features.h> #include <xen/page.h> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 8d4e2e1ae60b..ee3b00c7acda 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -119,6 +119,27 @@ static void __init xen_banner(void) version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } + +static void __init xen_pv_init_platform(void) +{ + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); + HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + /* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */ + xen_vcpu_info_reset(0); + + /* pvclock is in shared info area */ + xen_init_time_ops(); +} + +static void __init xen_pv_guest_late_init(void) +{ +#ifndef CONFIG_SMP + /* Setup shared vcpu info for non-smp configurations */ + xen_setup_vcpu_info_placement(); +#endif +} + /* Check if running on Xen version (major, minor) or later */ bool xen_running_on_version_or_later(unsigned int major, unsigned int minor) @@ -947,34 +968,8 @@ static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) xen_write_msr_safe(msr, low, high); } -void xen_setup_shared_info(void) -{ - set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); - - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - xen_setup_mfn_list_list(); - - if (system_state == SYSTEM_BOOTING) { -#ifndef CONFIG_SMP - /* - * In UP this is as good a place as any to set up shared info. - * Limit this to boot only, at restore vcpu setup is done via - * xen_vcpu_restore(). - */ - xen_setup_vcpu_info_placement(); -#endif - /* - * Now that shared info is set up we can start using routines - * that point to pvclock area. - */ - xen_init_time_ops(); - } -} - /* This is called once we have the cpu_possible_mask */ -void __ref xen_setup_vcpu_info_placement(void) +void __init xen_setup_vcpu_info_placement(void) { int cpu; @@ -1207,12 +1202,20 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_setup_features(); - xen_setup_machphys_mapping(); - /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops.patch = paravirt_patch_default; pv_cpu_ops = xen_cpu_ops; + xen_init_irq_ops(); + + /* + * Setup xen_vcpu early because it is needed for + * local_irq_disable(), irqs_disabled(), e.g. in printk(). + * + * Don't do the full vcpu_info placement stuff until we have + * the cpu_possible_mask and a non-dummy shared_info. + */ + xen_vcpu_info_reset(0); x86_platform.get_nmi_reason = xen_get_nmi_reason; @@ -1220,15 +1223,19 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_init.irqs.intr_mode_init = x86_init_noop; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + x86_init.hyper.init_platform = xen_pv_init_platform; + x86_init.hyper.guest_late_init = xen_pv_guest_late_init; /* * Set up some pagetable state before starting to set any ptes. */ + xen_setup_machphys_mapping(); xen_init_mmu_ops(); /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; + __default_kernel_pte_mask &= ~_PAGE_GLOBAL; /* * Prevent page tables from being allocated in highmem, even @@ -1249,20 +1256,12 @@ asmlinkage __visible void __init xen_start_kernel(void) get_cpu_cap(&boot_cpu_data); x86_configure_nx(); - xen_init_irq_ops(); + /* Determine virtual and physical address sizes */ + get_cpu_address_sizes(&boot_cpu_data); /* Let's presume PV guests always boot on vCPU with id 0. */ per_cpu(xen_vcpu_id, 0) = 0; - /* - * Setup xen_vcpu early because idt_setup_early_handler needs it for - * local_irq_disable(), irqs_disabled(). - * - * Don't do the full vcpu_info placement stuff until we have - * the cpu_possible_mask and a non-dummy shared_info. - */ - xen_vcpu_info_reset(0); - idt_setup_early_handler(); xen_init_capabilities(); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 74179852e46c..7515a19fd324 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -128,8 +128,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { void __init xen_init_irq_ops(void) { - /* For PVH we use default pv_irq_ops settings. */ - if (!xen_feature(XENFEAT_hvm_callback_vector)) - pv_irq_ops = xen_irq_ops; + pv_irq_ops = xen_irq_ops; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2c30cabfda90..52206ad81e4b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1230,8 +1230,7 @@ static void __init xen_pagetable_p2m_free(void) * We roundup to the PMD, which means that if anybody at this stage is * using the __ka address of xen_start_info or * xen_start_info->shared_info they are in going to crash. Fortunatly - * we have already revectored in xen_setup_kernel_pagetable and in - * xen_setup_shared_info. + * we have already revectored in xen_setup_kernel_pagetable. */ size = roundup(size, PMD_SIZE); @@ -1292,8 +1291,7 @@ static void __init xen_pagetable_init(void) /* Remap memory freed due to conflicts with E820 map */ xen_remap_memory(); - - xen_setup_shared_info(); + xen_setup_mfn_list_list(); } static void xen_write_cr2(unsigned long cr2) { diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index dc502ca8263e..2bce7958ce8b 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -80,9 +80,9 @@ void xen_mc_flush(void) and just do the call directly. */ mc = &b->entries[0]; - mc->result = privcmd_call(mc->op, - mc->args[0], mc->args[1], mc->args[2], - mc->args[3], mc->args[4]); + mc->result = xen_single_call(mc->op, mc->args[0], mc->args[1], + mc->args[2], mc->args[3], + mc->args[4]); ret = mc->result < 0; break; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index cd97a62394e7..973f10e05211 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -130,6 +130,10 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); void __init xen_init_spinlocks(void) { + /* Don't need to use pvqspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + xen_pvspin = false; + if (!xen_pvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c index a2e0f110af56..8303b58c79a9 100644 --- a/arch/x86/xen/suspend_pv.c +++ b/arch/x86/xen/suspend_pv.c @@ -27,8 +27,9 @@ void xen_pv_pre_suspend(void) void xen_pv_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); - - xen_setup_shared_info(); + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); + HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + xen_setup_mfn_list_list(); if (suspend_cancelled) { xen_start_info->store_mfn = diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index e0f1bcf01d63..c84f1e039d84 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -31,6 +31,8 @@ /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 +static u64 xen_sched_clock_offset __read_mostly; + /* Get the TSC speed from Xen */ static unsigned long xen_tsc_khz(void) { @@ -40,7 +42,7 @@ static unsigned long xen_tsc_khz(void) return pvclock_tsc_khz(info); } -u64 xen_clocksource_read(void) +static u64 xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; u64 ret; @@ -57,6 +59,11 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs) return xen_clocksource_read(); } +static u64 xen_sched_clock(void) +{ + return xen_clocksource_read() - xen_sched_clock_offset; +} + static void xen_read_wallclock(struct timespec64 *ts) { struct shared_info *s = HYPERVISOR_shared_info; @@ -367,7 +374,7 @@ void xen_timer_resume(void) } static const struct pv_time_ops xen_time_ops __initconst = { - .sched_clock = xen_clocksource_read, + .sched_clock = xen_sched_clock, .steal_clock = xen_steal_clock, }; @@ -503,8 +510,9 @@ static void __init xen_time_init(void) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } -void __ref xen_init_time_ops(void) +void __init xen_init_time_ops(void) { + xen_sched_clock_offset = xen_clocksource_read(); pv_time_ops = xen_time_ops; x86_init.timers.timer_init = xen_time_init; @@ -542,11 +550,11 @@ void __init xen_hvm_init_time_ops(void) return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { - printk(KERN_INFO "Xen doesn't support pvclock on HVM," - "disable pv timer\n"); + pr_info("Xen doesn't support pvclock on HVM, disable pv timer"); return; } + xen_sched_clock_offset = xen_clocksource_read(); pv_time_ops = xen_time_ops; x86_init.timers.setup_percpu_clockev = xen_time_init; x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3b34745d0a52..e78684597f57 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,7 +31,6 @@ extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); -void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); @@ -68,12 +67,11 @@ void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); -u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void xen_save_time_memory_area(void); void xen_restore_time_memory_area(void); -void __ref xen_init_time_ops(void); -void __init xen_hvm_init_time_ops(void); +void xen_init_time_ops(void); +void xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); |